YarrParser.h [plain text]

/*
 * Copyright (C) 2009, 2014-2016 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#pragma once

#include "Yarr.h"
#include "YarrPattern.h"
#include "YarrUnicodeProperties.h"
#include <wtf/ASCIICType.h>
#include <wtf/HashSet.h>
#include <wtf/Optional.h>
#include <wtf/text/StringBuilder.h>
#include <wtf/text/WTFString.h>

namespace JSC { namespace Yarr {

// The Parser class should not be used directly - only via the Yarr::parse() method.
template<class Delegate, typename CharType>
class Parser {
private:
    template<class FriendDelegate>
    friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit);

    /*
     * CharacterClassParserDelegate:
     *
     * The class CharacterClassParserDelegate is used in the parsing of character
     * classes.  This class handles detection of character ranges.  This class
     * implements enough of the delegate interface such that it can be passed to
     * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused
     * to perform the parsing of escape characters in character sets.
     */
    class CharacterClassParserDelegate {
    public:
        CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
            : m_delegate(delegate)
            , m_errorCode(err)
            , m_state(Empty)
            , m_character(0)
        {
        }

        /*
         * begin():
         *
         * Called at beginning of construction.
         */
        void begin(bool invert)
        {
            m_delegate.atomCharacterClassBegin(invert);
        }

        /*
         * atomPatternCharacter():
         *
         * This method is called either from parseCharacterClass() (for an unescaped
         * character in a character class), or from parseEscape(). In the former case
         * the value true will be passed for the argument 'hyphenIsRange', and in this
         * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
         * is different to /[a\-z]/).
         */
        void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false)
        {
            switch (m_state) {
            case AfterCharacterClass:
                // Following a builtin character class we need look out for a hyphen.
                // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
                // If we see a hyphen following a charater class then unlike usual
                // we'll report it to the delegate immediately, and put ourself into
                // a poisoned state. Any following calls to add another character or
                // character class will result in an error. (A hypen following a
                // character-class is itself valid, but only  at the end of a regex).
                if (hyphenIsRange && ch == '-') {
                    m_delegate.atomCharacterClassAtom('-');
                    m_state = AfterCharacterClassHyphen;
                    return;
                }
                // Otherwise just fall through - cached character so treat this as Empty.
                FALLTHROUGH;

            case Empty:
                m_character = ch;
                m_state = CachedCharacter;
                return;

            case CachedCharacter:
                if (hyphenIsRange && ch == '-')
                    m_state = CachedCharacterHyphen;
                else {
                    m_delegate.atomCharacterClassAtom(m_character);
                    m_character = ch;
                }
                return;

            case CachedCharacterHyphen:
                if (ch < m_character) {
                    m_errorCode = ErrorCode::CharacterClassOutOfOrder;
                    return;
                }
                m_delegate.atomCharacterClassRange(m_character, ch);
                m_state = Empty;
                return;

                // See coment in atomBuiltInCharacterClass below.
                // This too is technically an error, per ECMA-262, and again we
                // we chose to allow this.  Note a subtlely here that while we
                // diverge from the spec's definition of CharacterRange we do
                // remain in compliance with the grammar.  For example, consider
                // the expression /[\d-a-z]/.  We comply with the grammar in
                // this case by not allowing a-z to be matched as a range.
            case AfterCharacterClassHyphen:
                m_delegate.atomCharacterClassAtom(ch);
                m_state = Empty;
                return;
            }
        }

        /*
         * atomBuiltInCharacterClass():
         *
         * Adds a built-in character class, called by parseEscape().
         */
        void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
        {
            switch (m_state) {
            case CachedCharacter:
                // Flush the currently cached character, then fall through.
                m_delegate.atomCharacterClassAtom(m_character);
                FALLTHROUGH;
            case Empty:
            case AfterCharacterClass:
                m_state = AfterCharacterClass;
                m_delegate.atomCharacterClassBuiltIn(classID, invert);
                return;

                // If we hit either of these cases, we have an invalid range that
                // looks something like /[x-\d]/ or /[\d-\d]/.
                // According to ECMA-262 this should be a syntax error, but
                // empirical testing shows this to break teh webz.  Instead we
                // comply with to the ECMA-262 grammar, and assume the grammar to
                // have matched the range correctly, but tweak our interpretation
                // of CharacterRange.  Effectively we implicitly handle the hyphen
                // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
            case CachedCharacterHyphen:
                m_delegate.atomCharacterClassAtom(m_character);
                m_delegate.atomCharacterClassAtom('-');
                FALLTHROUGH;
            case AfterCharacterClassHyphen:
                m_delegate.atomCharacterClassBuiltIn(classID, invert);
                m_state = Empty;
                return;
            }
        }

        /*
         * end():
         *
         * Called at end of construction.
         */
        void end()
        {
            if (m_state == CachedCharacter)
                m_delegate.atomCharacterClassAtom(m_character);
            else if (m_state == CachedCharacterHyphen) {
                m_delegate.atomCharacterClassAtom(m_character);
                m_delegate.atomCharacterClassAtom('-');
            }
            m_delegate.atomCharacterClassEnd();
        }

        // parseEscape() should never call these delegate methods when
        // invoked with inCharacterClass set.
        NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
        NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
        NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
        NO_RETURN_DUE_TO_ASSERT bool isValidNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
        NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }

    private:
        Delegate& m_delegate;
        ErrorCode& m_errorCode;
        enum CharacterClassConstructionState {
            Empty,
            CachedCharacter,
            CachedCharacterHyphen,
            AfterCharacterClass,
            AfterCharacterClassHyphen,
        } m_state;
        UChar32 m_character;
    };

    Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit)
        : m_delegate(delegate)
        , m_backReferenceLimit(backReferenceLimit)
        , m_data(pattern.characters<CharType>())
        , m_size(pattern.length())
        , m_isUnicode(isUnicode)
    {
    }

    // The handling of IdentityEscapes is different depending on the unicode flag.
    // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'.
    // For non-unicode patterns, most any character can be escaped.
    bool isIdentityEscapeAnError(int ch)
    {
        if (m_isUnicode && !strchr("^$\\.*+?()[]{}|/", ch)) {
            m_errorCode = ErrorCode::InvalidIdentityEscape;
            return true;
        }

        return false;
    }

    /*
     * parseEscape():
     *
     * Helper for parseTokens() AND parseCharacterClass().
     * Unlike the other parser methods, this function does not report tokens
     * directly to the member delegate (m_delegate), instead tokens are
     * emitted to the delegate provided as an argument.  In the case of atom
     * escapes, parseTokens() will call parseEscape() passing m_delegate as
     * an argument, and as such the escape will be reported to the delegate.
     *
     * However this method may also be used by parseCharacterClass(), in which
     * case a CharacterClassParserDelegate will be passed as the delegate that
     * tokens should be added to.  A boolean flag is also provided to indicate
     * whether that an escape in a CharacterClass is being parsed (some parsing
     * rules change in this context).
     *
     * The boolean value returned by this method indicates whether the token
     * parsed was an atom (outside of a characted class \b and \B will be
     * interpreted as assertions).
     */
    template<bool inCharacterClass, class EscapeDelegate>
    bool parseEscape(EscapeDelegate& delegate)
    {
        ASSERT(!hasError(m_errorCode));
        ASSERT(peek() == '\\');
        consume();

        if (atEndOfPattern()) {
            m_errorCode = ErrorCode::EscapeUnterminated;
            return false;
        }

        switch (peek()) {
        // Assertions
        case 'b':
            consume();
            if (inCharacterClass) {
                if (isIdentityEscapeAnError('b'))
                    break;

                delegate.atomPatternCharacter('\b');
            } else {
                delegate.assertionWordBoundary(false);
                return false;
            }
            break;
        case 'B':
            consume();
            if (inCharacterClass) {
                if (isIdentityEscapeAnError('B'))
                    break;

                delegate.atomPatternCharacter('B');
            } else {
                delegate.assertionWordBoundary(true);
                return false;
            }
            break;

        // CharacterClassEscape
        case 'd':
            consume();
            delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false);
            break;
        case 's':
            consume();
            delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false);
            break;
        case 'w':
            consume();
            delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false);
            break;
        case 'D':
            consume();
            delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true);
            break;
        case 'S':
            consume();
            delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true);
            break;
        case 'W':
            consume();
            delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true);
            break;

        // DecimalEscape
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': {
            // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
            // First, try to parse this as backreference.
            if (!inCharacterClass) {
                ParseState state = saveState();

                unsigned backReference = consumeNumber();
                if (backReference <= m_backReferenceLimit) {
                    delegate.atomBackReference(backReference);
                    break;
                }

                restoreState(state);

                if (m_isUnicode) {
                    m_errorCode = ErrorCode::InvalidBackreference;
                    return false;
                }
            }

            // Not a backreference, and not octal. Just a number.
            if (peek() >= '8') {
                delegate.atomPatternCharacter(consume());
                break;
            }

            // Fall-through to handle this as an octal escape.
            FALLTHROUGH;
        }

        // Octal escape
        case '0':
            delegate.atomPatternCharacter(consumeOctal());
            break;

        // ControlEscape
        case 'f':
            consume();
            delegate.atomPatternCharacter('\f');
            break;
        case 'n':
            consume();
            delegate.atomPatternCharacter('\n');
            break;
        case 'r':
            consume();
            delegate.atomPatternCharacter('\r');
            break;
        case 't':
            consume();
            delegate.atomPatternCharacter('\t');
            break;
        case 'v':
            consume();
            delegate.atomPatternCharacter('\v');
            break;

        // ControlLetter
        case 'c': {
            ParseState state = saveState();
            consume();
            if (!atEndOfPattern()) {
                int control = consume();

                // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
                if (inCharacterClass ? WTF::isASCIIAlphanumeric(control) || (control == '_') : WTF::isASCIIAlpha(control)) {
                    delegate.atomPatternCharacter(control & 0x1f);
                    break;
                }
            }
            restoreState(state);
            delegate.atomPatternCharacter('\\');
            break;
        }

        // HexEscape
        case 'x': {
            consume();
            int x = tryConsumeHex(2);
            if (x == -1) {
                if (isIdentityEscapeAnError('x'))
                    break;

                delegate.atomPatternCharacter('x');
            } else
                delegate.atomPatternCharacter(x);
            break;
        }

        // Named backreference
        case 'k': {
            consume();
            ParseState state = saveState();
            if (!atEndOfPattern() && !inCharacterClass) {
                if (consume() == '<') {
                    auto groupName = tryConsumeGroupName();
                    if (groupName) {
                        if (m_captureGroupNames.contains(groupName.value())) {
                            delegate.atomNamedBackReference(groupName.value());
                            break;
                        }
                        
                        if (delegate.isValidNamedForwardReference(groupName.value())) {
                            delegate.atomNamedForwardReference(groupName.value());
                            break;
                        }
                    }
                    if (m_isUnicode) {
                        m_errorCode = ErrorCode::InvalidBackreference;
                        break;
                    }
                }
            }
            restoreState(state);
            delegate.atomPatternCharacter('k');
            break;
        }

        // Unicode property escapes
        case 'p':
        case 'P': {
            int escapeChar = consume();

            if (!m_isUnicode) {
                if (isIdentityEscapeAnError(escapeChar))
                    break;
                delegate.atomPatternCharacter(escapeChar);
                break;
            }

            if (!atEndOfPattern() && peek() == '{') {
                consume();
                auto optClassID = tryConsumeUnicodePropertyExpression();
                if (!optClassID) {
                    // tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression
                    break;
                }
                delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P');
            } else
                m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
            break;
        }

        // UnicodeEscape
        case 'u': {
            consume();
            if (atEndOfPattern()) {
                if (isIdentityEscapeAnError('u'))
                    break;

                delegate.atomPatternCharacter('u');
                break;
            }

            if (m_isUnicode && peek() == '{') {
                consume();
                UChar32 codePoint = 0;
                do {
                    if (atEndOfPattern() || !isASCIIHexDigit(peek())) {
                        m_errorCode = ErrorCode::InvalidUnicodeEscape;
                        break;
                    }

                    codePoint = (codePoint << 4) | toASCIIHexValue(consume());

                    if (codePoint > UCHAR_MAX_VALUE)
                        m_errorCode = ErrorCode::InvalidUnicodeEscape;
                } while (!atEndOfPattern() && peek() != '}');
                if (!atEndOfPattern() && peek() == '}')
                    consume();
                else if (!hasError(m_errorCode))
                    m_errorCode = ErrorCode::InvalidUnicodeEscape;
                if (hasError(m_errorCode))
                    return false;

                delegate.atomPatternCharacter(codePoint);
                break;
            }
            int u = tryConsumeHex(4);
            if (u == -1) {
                if (isIdentityEscapeAnError('u'))
                    break;

                delegate.atomPatternCharacter('u');
            } else {
                // If we have the first of a surrogate pair, look for the second.
                if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') {
                    ParseState state = saveState();
                    consume();
                    
                    if (tryConsume('u')) {
                        int surrogate2 = tryConsumeHex(4);
                        if (U16_IS_TRAIL(surrogate2)) {
                            u = U16_GET_SUPPLEMENTARY(u, surrogate2);
                            delegate.atomPatternCharacter(u);
                            break;
                        }
                    }

                    restoreState(state);
                }
                delegate.atomPatternCharacter(u);
            }
            break;
        }

        // IdentityEscape
        default:
            int ch = peek();

            if (ch == '-' && m_isUnicode && inCharacterClass) {
                // \- is allowed for ClassEscape with unicode flag.
                delegate.atomPatternCharacter(consume());
                break;
            }

            if (isIdentityEscapeAnError(ch))
                break;

            delegate.atomPatternCharacter(consume());
        }
        
        return true;
    }

    UChar32 consumePossibleSurrogatePair()
    {
        UChar32 ch = consume();
        if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > 0)) {
            ParseState state = saveState();

            UChar32 surrogate2 = consume();
            if (U16_IS_TRAIL(surrogate2))
                ch = U16_GET_SUPPLEMENTARY(ch, surrogate2);
            else
                restoreState(state);
        }

        return ch;
    }

    /*
     * parseAtomEscape(), parseCharacterClassEscape():
     *
     * These methods alias to parseEscape().
     */
    bool parseAtomEscape()
    {
        return parseEscape<false>(m_delegate);
    }
    void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
    {
        parseEscape<true>(delegate);
    }

    /*
     * parseCharacterClass():
     *
     * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
     * to an instance of CharacterClassParserDelegate, to describe the character class to the
     * delegate.
     */
    void parseCharacterClass()
    {
        ASSERT(!hasError(m_errorCode));
        ASSERT(peek() == '[');
        consume();

        CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode);

        characterClassConstructor.begin(tryConsume('^'));

        while (!atEndOfPattern()) {
            switch (peek()) {
            case ']':
                consume();
                characterClassConstructor.end();
                return;

            case '\\':
                parseCharacterClassEscape(characterClassConstructor);
                break;

            default:
                characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true);
            }

            if (hasError(m_errorCode))
                return;
        }

        m_errorCode = ErrorCode::CharacterClassUnmatched;
    }

    /*
     * parseParenthesesBegin():
     *
     * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
     */
    void parseParenthesesBegin()
    {
        ASSERT(!hasError(m_errorCode));
        ASSERT(peek() == '(');
        consume();

        if (tryConsume('?')) {
            if (atEndOfPattern()) {
                m_errorCode = ErrorCode::ParenthesesTypeInvalid;
                return;
            }

            switch (consume()) {
            case ':':
                m_delegate.atomParenthesesSubpatternBegin(false);
                break;
            
            case '=':
                m_delegate.atomParentheticalAssertionBegin();
                break;

            case '!':
                m_delegate.atomParentheticalAssertionBegin(true);
                break;

            case '<': {
                auto groupName = tryConsumeGroupName();
                if (groupName) {
                    auto setAddResult = m_captureGroupNames.add(groupName.value());
                    if (setAddResult.isNewEntry)
                        m_delegate.atomParenthesesSubpatternBegin(true, groupName);
                    else
                        m_errorCode = ErrorCode::DuplicateGroupName;
                } else
                    m_errorCode = ErrorCode::InvalidGroupName;

                break;
            }

            default:
                m_errorCode = ErrorCode::ParenthesesTypeInvalid;
            }
        } else
            m_delegate.atomParenthesesSubpatternBegin();

        ++m_parenthesesNestingDepth;
    }

    /*
     * parseParenthesesEnd():
     *
     * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
     */
    void parseParenthesesEnd()
    {
        ASSERT(!hasError(m_errorCode));
        ASSERT(peek() == ')');
        consume();

        if (m_parenthesesNestingDepth > 0)
            m_delegate.atomParenthesesEnd();
        else
            m_errorCode = ErrorCode::ParenthesesUnmatched;

        --m_parenthesesNestingDepth;
    }

    /*
     * parseQuantifier():
     *
     * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
     */
    void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
    {
        ASSERT(!hasError(m_errorCode));
        ASSERT(min <= max);

        if (min == UINT_MAX) {
            m_errorCode = ErrorCode::QuantifierTooLarge;
            return;
        }

        if (lastTokenWasAnAtom)
            m_delegate.quantifyAtom(min, max, !tryConsume('?'));
        else
            m_errorCode = ErrorCode::QuantifierWithoutAtom;
    }

    /*
     * parseTokens():
     *
     * This method loops over the input pattern reporting tokens to the delegate.
     * The method returns when a parse error is detected, or the end of the pattern
     * is reached.  One piece of state is tracked around the loop, which is whether
     * the last token passed to the delegate was an atom (this is necessary to detect
     * a parse error when a quantifier provided without an atom to quantify).
     */
    void parseTokens()
    {
        bool lastTokenWasAnAtom = false;

        while (!atEndOfPattern()) {
            switch (peek()) {
            case '|':
                consume();
                m_delegate.disjunction();
                lastTokenWasAnAtom = false;
                break;

            case '(':
                parseParenthesesBegin();
                lastTokenWasAnAtom = false;
                break;

            case ')':
                parseParenthesesEnd();
                lastTokenWasAnAtom = true;
                break;

            case '^':
                consume();
                m_delegate.assertionBOL();
                lastTokenWasAnAtom = false;
                break;

            case '$':
                consume();
                m_delegate.assertionEOL();
                lastTokenWasAnAtom = false;
                break;

            case '.':
                consume();
                m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false);
                lastTokenWasAnAtom = true;
                break;

            case '[':
                parseCharacterClass();
                lastTokenWasAnAtom = true;
                break;

            case '\\':
                lastTokenWasAnAtom = parseAtomEscape();
                break;

            case '*':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, quantifyInfinite);
                lastTokenWasAnAtom = false;
                break;

            case '+':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 1, quantifyInfinite);
                lastTokenWasAnAtom = false;
                break;

            case '?':
                consume();
                parseQuantifier(lastTokenWasAnAtom, 0, 1);
                lastTokenWasAnAtom = false;
                break;

            case '{': {
                ParseState state = saveState();

                consume();
                if (peekIsDigit()) {
                    unsigned min = consumeNumber();
                    unsigned max = min;
                    
                    if (tryConsume(','))
                        max = peekIsDigit() ? consumeNumber() : quantifyInfinite;

                    if (tryConsume('}')) {
                        if (min <= max)
                            parseQuantifier(lastTokenWasAnAtom, min, max);
                        else
                            m_errorCode = ErrorCode::QuantifierOutOfOrder;
                        lastTokenWasAnAtom = false;
                        break;
                    }
                }

                restoreState(state);
            }
            // if we did not find a complete quantifer, fall through to the default case.
            FALLTHROUGH;

            default:
                m_delegate.atomPatternCharacter(consumePossibleSurrogatePair());
                lastTokenWasAnAtom = true;
            }

            if (hasError(m_errorCode))
                return;
        }

        if (m_parenthesesNestingDepth > 0)
            m_errorCode = ErrorCode::MissingParentheses;
    }

    /*
     * parse():
     *
     * This method calls parseTokens() to parse over the input and returns error code for a result.
     */
    ErrorCode parse()
    {
        if (m_size > MAX_PATTERN_SIZE)
            m_errorCode = ErrorCode::PatternTooLarge;
        else
            parseTokens();
        ASSERT(atEndOfPattern() || hasError(m_errorCode));
        
        return m_errorCode;
    }

    // Misc helper functions:

    typedef unsigned ParseState;
    
    ParseState saveState()
    {
        return m_index;
    }

    void restoreState(ParseState state)
    {
        m_index = state;
    }

    bool atEndOfPattern()
    {
        ASSERT(m_index <= m_size);
        return m_index == m_size;
    }

    unsigned patternRemaining()
    {
        ASSERT(m_index <= m_size);
        return m_size - m_index;
    }

    int peek()
    {
        ASSERT(m_index < m_size);
        return m_data[m_index];
    }

    bool peekIsDigit()
    {
        return !atEndOfPattern() && WTF::isASCIIDigit(peek());
    }

    unsigned peekDigit()
    {
        ASSERT(peekIsDigit());
        return peek() - '0';
    }

    int tryConsumeUnicodeEscape()
    {
        if (!tryConsume('u'))
            return -1;

        if (m_isUnicode && tryConsume('{')) {
            int codePoint = 0;
            do {
                if (atEndOfPattern() || !isASCIIHexDigit(peek())) {
                    m_errorCode = ErrorCode::InvalidUnicodeEscape;
                    return -1;
                }

                codePoint = (codePoint << 4) | toASCIIHexValue(consume());

                if (codePoint > UCHAR_MAX_VALUE) {
                    m_errorCode = ErrorCode::InvalidUnicodeEscape;
                    return -1;
                }
            } while (!atEndOfPattern() && peek() != '}');
            if (!atEndOfPattern() && peek() == '}')
                consume();
            else if (!hasError(m_errorCode))
                m_errorCode = ErrorCode::InvalidUnicodeEscape;
            if (hasError(m_errorCode))
                return -1;

            return codePoint;
        }

        int u = tryConsumeHex(4);
        if (u == -1)
            return -1;

        // If we have the first of a surrogate pair, look for the second.
        if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') {
            ParseState state = saveState();
            consume();

            if (tryConsume('u')) {
                int surrogate2 = tryConsumeHex(4);
                if (U16_IS_TRAIL(surrogate2)) {
                    u = U16_GET_SUPPLEMENTARY(u, surrogate2);
                    return u;
                }
            }

            restoreState(state);
        }

        return u;
    }

    int tryConsumeIdentifierCharacter()
    {
        int ch = peek();

        if (ch == '\\') {
            consume();
            ch = tryConsumeUnicodeEscape();
        } else
            consume();

        return ch;
    }

    bool isIdentifierStart(int ch)
    {
        return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK);
    }

    bool isIdentifierPart(int ch)
    {
        return (WTF::isASCII(ch) && (WTF::isASCIIAlpha(ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D;
    }

    bool isUnicodePropertyValueExpressionChar(int ch)
    {
        return WTF::isASCIIAlphanumeric(ch) || ch == '_' || ch == '=';
    }

    int consume()
    {
        ASSERT(m_index < m_size);
        return m_data[m_index++];
    }

    unsigned consumeDigit()
    {
        ASSERT(peekIsDigit());
        return consume() - '0';
    }

    unsigned consumeNumber()
    {
        Checked<unsigned, RecordOverflow> n = consumeDigit();
        while (peekIsDigit())
            n = n * 10 + consumeDigit();
        return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet();
    }

    unsigned consumeOctal()
    {
        ASSERT(WTF::isASCIIOctalDigit(peek()));

        unsigned n = consumeDigit();
        while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
            n = n * 8 + consumeDigit();
        return n;
    }

    bool tryConsume(UChar ch)
    {
        if (atEndOfPattern() || (m_data[m_index] != ch))
            return false;
        ++m_index;
        return true;
    }

    int tryConsumeHex(int count)
    {
        ParseState state = saveState();

        int n = 0;
        while (count--) {
            if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
                restoreState(state);
                return -1;
            }
            n = (n << 4) | WTF::toASCIIHexValue(consume());
        }
        return n;
    }

    Optional<String> tryConsumeGroupName()
    {
        if (atEndOfPattern())
            return WTF::nullopt;

        ParseState state = saveState();
        
        int ch = tryConsumeIdentifierCharacter();

        if (isIdentifierStart(ch)) {
            StringBuilder identifierBuilder;
            identifierBuilder.append(ch);

            while (!atEndOfPattern()) {
                ch = tryConsumeIdentifierCharacter();
                if (ch == '>')
                    return Optional<String>(identifierBuilder.toString());

                if (!isIdentifierPart(ch))
                    break;

                identifierBuilder.append(ch);
            }
        }

        restoreState(state);

        return WTF::nullopt;
    }

    Optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression()
    {
        if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(peek())) {
            m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
            return WTF::nullopt;
        }

        StringBuilder expressionBuilder;
        String unicodePropertyName;
        bool foundEquals = false;
        unsigned errors = 0;

        expressionBuilder.append(consume());

        while (!atEndOfPattern()) {
            int ch = peek();
            if (ch == '}') {
                consume();
                if (errors) {
                    m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
                    return WTF::nullopt;
                }

                if (foundEquals) {
                    auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString());
                    if (!result)
                        m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
                    return result;
                }

                auto result = unicodeMatchProperty(expressionBuilder.toString());
                if (!result)
                    m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
                return result;
            }

            consume();
            if (ch == '=') {
                if (!foundEquals) {
                    foundEquals = true;
                    unicodePropertyName = expressionBuilder.toString();
                    expressionBuilder.clear();
                } else
                    errors++;
            } else if (!isUnicodePropertyValueExpressionChar(ch))
                errors++;
            else
                expressionBuilder.append(ch);
        }

        m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
        return WTF::nullopt;
    }

    Delegate& m_delegate;
    unsigned m_backReferenceLimit;
    ErrorCode m_errorCode { ErrorCode::NoError };
    const CharType* m_data;
    unsigned m_size;
    unsigned m_index { 0 };
    bool m_isUnicode;
    unsigned m_parenthesesNestingDepth { 0 };
    HashSet<String> m_captureGroupNames;

    // Derived by empirical testing of compile time in PCRE and WREC.
    static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
};

/*
 * Yarr::parse():
 *
 * The parse method is passed a pattern to be parsed and a delegate upon which
 * callbacks will be made to record the parsed tokens forming the regex.
 * Yarr::parse() returns null on success, or a const C string providing an error
 * message where a parse error occurs.
 *
 * The Delegate must implement the following interface:
 *
 *    void assertionBOL();
 *    void assertionEOL();
 *    void assertionWordBoundary(bool invert);
 *
 *    void atomPatternCharacter(UChar32 ch);
 *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
 *    void atomCharacterClassBegin(bool invert)
 *    void atomCharacterClassAtom(UChar32 ch)
 *    void atomCharacterClassRange(UChar32 begin, UChar32 end)
 *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
 *    void atomCharacterClassEnd()
 *    void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName);
 *    void atomParentheticalAssertionBegin(bool invert = false);
 *    void atomParenthesesEnd();
 *    void atomBackReference(unsigned subpatternId);
 *    void atomNamedBackReference(const String& subpatternName);
 *    bool isValidNamedForwardReference(const String& subpatternName);
 *    void atomNamedForwardReference(const String& subpatternName);
 *
 *    void quantifyAtom(unsigned min, unsigned max, bool greedy);
 *
 *    void disjunction();
 *
 * The regular expression is described by a sequence of assertion*() and atom*()
 * callbacks to the delegate, describing the terms in the regular expression.
 * Following an atom a quantifyAtom() call may occur to indicate that the previous
 * atom should be quantified.  In the case of atoms described across multiple
 * calls (parentheses and character classes) the call to quantifyAtom() will come
 * after the call to the atom*End() method, never after atom*Begin().
 *
 * Character classes may either be described by a single call to
 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
 * In the latter case, ...Begin() will be called, followed by a sequence of
 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
 *
 * Sequences of atoms and assertions are broken into alternatives via calls to
 * disjunction().  Assertions, atoms, and disjunctions emitted between calls to
 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
 * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular
 * capturing subpattern, this will be the subpatternId associated with these
 * parentheses, and will also by definition be the lowest subpatternId of these
 * parentheses and of any nested paretheses.  The atomParenthesesEnd() method
 * is passed the subpatternId of the last capturing subexpression nested within
 * these paretheses.  In the case of a capturing subpattern with no nested
 * capturing subpatterns, the same subpatternId will be passed to the begin and
 * end functions.  In the case of non-capturing subpatterns the subpatternId
 * passed to the begin method is also the first possible subpatternId that might
 * be nested within these paretheses.  If a set of non-capturing parentheses does
 * not contain any capturing subpatterns, then the subpatternId passed to begin
 * will be greater than the subpatternId passed to end.
 */

template<class Delegate>
ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite)
{
    if (pattern.is8Bit())
        return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
    return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
}

} } // namespace JSC::Yarr