WRECParser.cpp   [plain text]


/*
 * Copyright (C) 2008 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "config.h"
#include "WRECParser.h"

#if ENABLE(WREC)

#include "CharacterClassConstructor.h"
#include "WRECFunctors.h"

using namespace WTF;

namespace JSC { namespace WREC {

// These error messages match the error messages used by PCRE.
const char* Parser::QuantifierOutOfOrder = "numbers out of order in {} quantifier";
const char* Parser::QuantifierWithoutAtom = "nothing to repeat";
const char* Parser::ParenthesesUnmatched = "unmatched parentheses";
const char* Parser::ParenthesesTypeInvalid = "unrecognized character after (?";
const char* Parser::ParenthesesNotSupported = ""; // Not a user-visible syntax error -- just signals a syntax that WREC doesn't support yet.
const char* Parser::CharacterClassUnmatched = "missing terminating ] for character class";
const char* Parser::CharacterClassOutOfOrder = "range out of order in character class";
const char* Parser::EscapeUnterminated = "\\ at end of pattern";

class PatternCharacterSequence {
typedef Generator::JumpList JumpList;

public:
    PatternCharacterSequence(Generator& generator, JumpList& failures)
        : m_generator(generator)
        , m_failures(failures)
    {
    }
    
    size_t size() { return m_sequence.size(); }
    
    void append(int ch)
    {
        m_sequence.append(ch);
    }
    
    void flush()
    {
        if (!m_sequence.size())
            return;

        m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size());
        m_sequence.clear();
    }

    void flush(const Quantifier& quantifier)
    {
        if (!m_sequence.size())
            return;

        m_generator.generatePatternCharacterSequence(m_failures, m_sequence.begin(), m_sequence.size() - 1);

        switch (quantifier.type) {
        case Quantifier::None:
        case Quantifier::Error:
            ASSERT_NOT_REACHED();
            break;

        case Quantifier::Greedy: {
            GeneratePatternCharacterFunctor functor(m_sequence.last());
            m_generator.generateGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
            break;
        }
        
        case Quantifier::NonGreedy: {
            GeneratePatternCharacterFunctor functor(m_sequence.last());
            m_generator.generateNonGreedyQuantifier(m_failures, functor, quantifier.min, quantifier.max);
            break;
        }
        }
        
        m_sequence.clear();
    }

private:
    Generator& m_generator;
    JumpList& m_failures;
    Vector<int, 8> m_sequence;
};

ALWAYS_INLINE Quantifier Parser::consumeGreedyQuantifier()
{
    switch (peek()) {
        case '?':
            consume();
            return Quantifier(Quantifier::Greedy, 0, 1);

        case '*':
            consume();
            return Quantifier(Quantifier::Greedy, 0);

        case '+':
            consume();
            return Quantifier(Quantifier::Greedy, 1);

        case '{': {
            SavedState state(*this);
            consume();

            // Accept: {n}, {n,}, {n,m}.
            // Reject: {n,m} where n > m.
            // Ignore: Anything else, such as {n, m}.

            if (!peekIsDigit()) {
                state.restore();
                return Quantifier();
            }

            unsigned min = consumeNumber();
            unsigned max = min;

            if (peek() == ',') {
                consume();
                max = peekIsDigit() ? consumeNumber() : Quantifier::Infinity;
            }

            if (peek() != '}') {
                state.restore();
                return Quantifier();
            }
            consume();
 
            if (min > max) {
                setError(QuantifierOutOfOrder);
                return Quantifier(Quantifier::Error);
            }

            return Quantifier(Quantifier::Greedy, min, max);
         }

         default:
            return Quantifier(); // No quantifier.
    }
}

Quantifier Parser::consumeQuantifier()
{
    Quantifier q = consumeGreedyQuantifier();
    
    if ((q.type == Quantifier::Greedy) && (peek() == '?')) {
        consume();
        q.type = Quantifier::NonGreedy;
    }
    
    return q;
}

bool Parser::parseCharacterClassQuantifier(JumpList& failures, const CharacterClass& charClass, bool invert)
{
    Quantifier q = consumeQuantifier();

    switch (q.type) {
    case Quantifier::None: {
        m_generator.generateCharacterClass(failures, charClass, invert);
        break;
    }

    case Quantifier::Greedy: {
        GenerateCharacterClassFunctor functor(&charClass, invert);
        m_generator.generateGreedyQuantifier(failures, functor, q.min, q.max);
        break;
    }

    case Quantifier::NonGreedy: {
        GenerateCharacterClassFunctor functor(&charClass, invert);
        m_generator.generateNonGreedyQuantifier(failures, functor, q.min, q.max);
        break;
    }

    case Quantifier::Error:
        return false;
    }
    
    return true;
}

bool Parser::parseBackreferenceQuantifier(JumpList& failures, unsigned subpatternId)
{
    Quantifier q = consumeQuantifier();

    switch (q.type) {
    case Quantifier::None: {
        m_generator.generateBackreference(failures, subpatternId);
        break;
    }

    case Quantifier::Greedy:
    case Quantifier::NonGreedy:
        m_generator.generateBackreferenceQuantifier(failures, q.type, subpatternId, q.min, q.max);
        return true;

    case Quantifier::Error:
        return false;
    }
    
    return true;
}

bool Parser::parseParentheses(JumpList& failures)
{
    ParenthesesType type = consumeParenthesesType();

    // FIXME: WREC originally failed to backtrack correctly in cases such as
    // "c".match(/(.*)c/). Now, most parentheses handling is disabled. For
    // unsupported parentheses, we fall back on PCRE.

    switch (type) {
        case Generator::Assertion: {
            m_generator.generateParenthesesAssertion(failures);

            if (consume() != ')') {
                setError(ParenthesesUnmatched);
                return false;
            }

            Quantifier quantifier = consumeQuantifier();
            if (quantifier.type != Quantifier::None && quantifier.min == 0) {
                setError(ParenthesesNotSupported);
                return false;
            }

            return true;
        }
        case Generator::InvertedAssertion: {
            m_generator.generateParenthesesInvertedAssertion(failures);

            if (consume() != ')') {
                setError(ParenthesesUnmatched);
                return false;
            }

            Quantifier quantifier = consumeQuantifier();
            if (quantifier.type != Quantifier::None && quantifier.min == 0) {
                setError(ParenthesesNotSupported);
                return false;
            }

            return true;
        }
        default:
            setError(ParenthesesNotSupported);
            return false;
    }
}

bool Parser::parseCharacterClass(JumpList& failures)
{
    bool invert = false;
    if (peek() == '^') {
        consume();
        invert = true;
    }

    CharacterClassConstructor constructor(m_ignoreCase);

    int ch;
    while ((ch = peek()) != ']') {
        switch (ch) {
        case EndOfPattern:
            setError(CharacterClassUnmatched);
            return false;

        case '\\': {
            consume();
            Escape escape = consumeEscape(true);

            switch (escape.type()) {
                case Escape::PatternCharacter: {
                    int character = PatternCharacterEscape::cast(escape).character();
                    if (character == '-')
                        constructor.flushBeforeEscapedHyphen();
                    constructor.put(character);
                    break;
                }
                case Escape::CharacterClass: {
                    const CharacterClassEscape& characterClassEscape = CharacterClassEscape::cast(escape);
                    ASSERT(!characterClassEscape.invert());
                    constructor.append(characterClassEscape.characterClass());
                    break;
                }
                case Escape::Error:
                    return false;
                case Escape::Backreference:
                case Escape::WordBoundaryAssertion: {
                    ASSERT_NOT_REACHED();
                    break;
                }
            }
            break;
        }

        default:
            consume();
            constructor.put(ch);
        }
    }
    consume();

    // lazily catch reversed ranges ([z-a])in character classes
    if (constructor.isUpsideDown()) {
        setError(CharacterClassOutOfOrder);
        return false;
    }

    constructor.flush();
    CharacterClass charClass = constructor.charClass();
    return parseCharacterClassQuantifier(failures, charClass, invert);
}

bool Parser::parseNonCharacterEscape(JumpList& failures, const Escape& escape)
{
    switch (escape.type()) {
        case Escape::PatternCharacter:
            ASSERT_NOT_REACHED();
            return false;

        case Escape::CharacterClass:
            return parseCharacterClassQuantifier(failures, CharacterClassEscape::cast(escape).characterClass(), CharacterClassEscape::cast(escape).invert());

        case Escape::Backreference:
            return parseBackreferenceQuantifier(failures, BackreferenceEscape::cast(escape).subpatternId());

        case Escape::WordBoundaryAssertion:
            m_generator.generateAssertionWordBoundary(failures, WordBoundaryAssertionEscape::cast(escape).invert());
            return true;

        case Escape::Error:
            return false;
    }

    ASSERT_NOT_REACHED();
    return false;
}

Escape Parser::consumeEscape(bool inCharacterClass)
{
    switch (peek()) {
    case EndOfPattern:
        setError(EscapeUnterminated);
        return Escape(Escape::Error);

    // Assertions
    case 'b':
        consume();
        if (inCharacterClass)
            return PatternCharacterEscape('\b');
        return WordBoundaryAssertionEscape(false); // do not invert
    case 'B':
        consume();
        if (inCharacterClass)
            return PatternCharacterEscape('B');
        return WordBoundaryAssertionEscape(true); // invert

    // CharacterClassEscape
    case 'd':
        consume();
        return CharacterClassEscape(CharacterClass::digits(), false);
    case 's':
        consume();
        return CharacterClassEscape(CharacterClass::spaces(), false);
    case 'w':
        consume();
        return CharacterClassEscape(CharacterClass::wordchar(), false);
    case 'D':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nondigits(), false)
            : CharacterClassEscape(CharacterClass::digits(), true);
    case 'S':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nonspaces(), false)
            : CharacterClassEscape(CharacterClass::spaces(), true);
    case 'W':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nonwordchar(), false)
            : CharacterClassEscape(CharacterClass::wordchar(), true);

    // DecimalEscape
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9': {
        if (peekDigit() > m_numSubpatterns || inCharacterClass) {
            // To match Firefox, we parse an invalid backreference in the range [1-7]
            // as an octal escape.
            return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal());
        }

        int value = 0;
        do {
            unsigned newValue = value * 10 + peekDigit();
            if (newValue > m_numSubpatterns)
                break;
            value = newValue;
            consume();
        } while (peekIsDigit());

        return BackreferenceEscape(value);
    }

    // Octal escape
    case '0':
        consume();
        return PatternCharacterEscape(consumeOctal());

    // ControlEscape
    case 'f':
        consume();
        return PatternCharacterEscape('\f');
    case 'n':
        consume();
        return PatternCharacterEscape('\n');
    case 'r':
        consume();
        return PatternCharacterEscape('\r');
    case 't':
        consume();
        return PatternCharacterEscape('\t');
    case 'v':
        consume();
        return PatternCharacterEscape('\v');

    // ControlLetter
    case 'c': {
        SavedState state(*this);
        consume();
        
        int control = consume();
        // To match Firefox, inside a character class, we also accept numbers
        // and '_' as control characters.
        if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) {
            state.restore();
            return PatternCharacterEscape('\\');
        }
        return PatternCharacterEscape(control & 31);
    }

    // HexEscape
    case 'x': {
        consume();

        SavedState state(*this);
        int x = consumeHex(2);
        if (x == -1) {
            state.restore();
            return PatternCharacterEscape('x');
        }
        return PatternCharacterEscape(x);
    }

    // UnicodeEscape
    case 'u': {
        consume();

        SavedState state(*this);
        int x = consumeHex(4);
        if (x == -1) {
            state.restore();
            return PatternCharacterEscape('u');
        }
        return PatternCharacterEscape(x);
    }

    // IdentityEscape
    default:
        return PatternCharacterEscape(consume());
    }
}

void Parser::parseAlternative(JumpList& failures)
{
    PatternCharacterSequence sequence(m_generator, failures);

    while (1) {
        switch (peek()) {
        case EndOfPattern:
        case '|':
        case ')':
            sequence.flush();
            return;

        case '*':
        case '+':
        case '?':
        case '{': {
            Quantifier q = consumeQuantifier();

            if (q.type == Quantifier::None) {
                sequence.append(consume());
                continue;
            }

            if (q.type == Quantifier::Error)
                return;

            if (!sequence.size()) {
                setError(QuantifierWithoutAtom);
                return;
            }

            sequence.flush(q);
            continue;
        }

        case '^':
            consume();

            sequence.flush();
            m_generator.generateAssertionBOL(failures);
            continue;

        case '$':
            consume();

            sequence.flush();
            m_generator.generateAssertionEOL(failures);
            continue;

        case '.':
            consume();

            sequence.flush();
            if (!parseCharacterClassQuantifier(failures, CharacterClass::newline(), true))
                return;
            continue;

        case '[':
            consume();

            sequence.flush();
            if (!parseCharacterClass(failures))
                return;
            continue;

        case '(':
            consume();

            sequence.flush();
            if (!parseParentheses(failures))
                return;
            continue;

        case '\\': {
            consume();

            Escape escape = consumeEscape(false);
            if (escape.type() == Escape::PatternCharacter) {
                sequence.append(PatternCharacterEscape::cast(escape).character());
                continue;
            }

            sequence.flush();
            if (!parseNonCharacterEscape(failures, escape))
                return;
            continue;
        }

        default:
            sequence.append(consume());
            continue;
        }
    }
}

/*
  TOS holds index.
*/
void Parser::parseDisjunction(JumpList& failures)
{
    parseAlternative(failures);
    if (peek() != '|')
        return;

    JumpList successes;
    do {
        consume();
        m_generator.terminateAlternative(successes, failures);
        parseAlternative(failures);
    } while (peek() == '|');

    m_generator.terminateDisjunction(successes);
}

Generator::ParenthesesType Parser::consumeParenthesesType()
{
    if (peek() != '?')
        return Generator::Capturing;
    consume();

    switch (consume()) {
    case ':':
        return Generator::NonCapturing;
    
    case '=':
        return Generator::Assertion;

    case '!':
        return Generator::InvertedAssertion;

    default:
        setError(ParenthesesTypeInvalid);
        return Generator::Error;
    }
}

} } // namespace JSC::WREC

#endif // ENABLE(WREC)