URLFilterParser.cpp   [plain text]


/*
 * Copyright (C) 2014, 2015 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "URLFilterParser.h"

#if ENABLE(CONTENT_EXTENSIONS)

#include "CombinedURLFilters.h"
#include "Term.h"
#include <JavaScriptCore/YarrParser.h>
#include <wtf/Deque.h>
#include <wtf/text/CString.h>

namespace WebCore {

namespace ContentExtensions {

class PatternParser {
public:
    PatternParser(bool patternIsCaseSensitive)
        : m_patternIsCaseSensitive(patternIsCaseSensitive)
        , m_parseStatus(URLFilterParser::Ok)
    {
    }

    void finalize(uint64_t patternId, CombinedURLFilters& combinedURLFilters)
    {
        if (hasError())
            return;

        sinkFloatingTermIfNecessary();

        simplifySunkTerms();

        // Check to see if there are any terms without ? or *.
        bool matchesEverything = true;
        for (const auto& term : m_sunkTerms) {
            if (term.matchesAtLeastOneCharacter()) {
                matchesEverything = false;
                break;
            }
        }
        if (matchesEverything) {
            fail(URLFilterParser::MatchesEverything);
            return;
        }

        combinedURLFilters.addPattern(patternId, m_sunkTerms);
    }

    URLFilterParser::ParseStatus parseStatus() const
    {
        return m_parseStatus;
    }

    void atomPatternCharacter(UChar character)
    {
        if (hasError())
            return;

        if (!isASCII(character)) {
            fail(URLFilterParser::NonASCII);
            return;
        }

        sinkFloatingTermIfNecessary();
        ASSERT(!m_floatingTerm.isValid());

        char asciiChararacter = static_cast<char>(character);
        m_floatingTerm = Term(asciiChararacter, m_patternIsCaseSensitive);
    }

    void atomBuiltInCharacterClass(JSC::Yarr::BuiltInCharacterClassID builtInCharacterClassID, bool inverted)
    {
        if (hasError())
            return;

        sinkFloatingTermIfNecessary();
        ASSERT(!m_floatingTerm.isValid());

        if (builtInCharacterClassID == JSC::Yarr::NewlineClassID && inverted)
            m_floatingTerm = Term(Term::UniversalTransition);
        else
            fail(URLFilterParser::UnsupportedCharacterClass);
    }

    void quantifyAtom(unsigned minimum, unsigned maximum, bool)
    {
        if (hasError())
            return;

        ASSERT(m_floatingTerm.isValid());

        if (!minimum && maximum == 1)
            m_floatingTerm.quantify(AtomQuantifier::ZeroOrOne);
        else if (!minimum && maximum == JSC::Yarr::quantifyInfinite)
            m_floatingTerm.quantify(AtomQuantifier::ZeroOrMore);
        else if (minimum == 1 && maximum == JSC::Yarr::quantifyInfinite)
            m_floatingTerm.quantify(AtomQuantifier::OneOrMore);
        else
            fail(URLFilterParser::InvalidQuantifier);
    }

    void atomBackReference(unsigned)
    {
        fail(URLFilterParser::BackReference);
    }

    void assertionBOL()
    {
        if (hasError())
            return;

        if (m_floatingTerm.isValid() || !m_sunkTerms.isEmpty() || !m_openGroups.isEmpty()) {
            fail(URLFilterParser::MisplacedStartOfLine);
            return;
        }

        m_hasBeginningOfLineAssertion = true;
    }

    void assertionEOL()
    {
        if (hasError())
            return;

        sinkFloatingTermIfNecessary();
        ASSERT(!m_floatingTerm.isValid());

        m_floatingTerm = Term(Term::EndOfLineAssertionTerm);
    }

    void assertionWordBoundary(bool)
    {
        fail(URLFilterParser::WordBoundary);
    }

    void atomCharacterClassBegin(bool inverted = false)
    {
        if (hasError())
            return;

        sinkFloatingTermIfNecessary();
        ASSERT(!m_floatingTerm.isValid());

        m_floatingTerm = Term(Term::CharacterSetTerm, inverted);
    }

    void atomCharacterClassAtom(UChar character)
    {
        if (hasError())
            return;

        ASSERT(isASCII(character));

        m_floatingTerm.addCharacter(character, m_patternIsCaseSensitive);
    }

    void atomCharacterClassRange(UChar a, UChar b)
    {
        if (hasError())
            return;

        ASSERT(a);
        ASSERT(b);
        ASSERT(isASCII(a));
        ASSERT(isASCII(b));

        for (unsigned i = a; i <= b; ++i)
            m_floatingTerm.addCharacter(static_cast<UChar>(i), m_patternIsCaseSensitive);
    }

    void atomCharacterClassEnd()
    {
        // Nothing to do here. The character set atom may have a quantifier, we sink the atom lazily.
    }

    void atomCharacterClassBuiltIn(JSC::Yarr::BuiltInCharacterClassID, bool)
    {
        fail(URLFilterParser::AtomCharacter);
    }

    void atomParenthesesSubpatternBegin(bool = true)
    {
        if (hasError())
            return;

        sinkFloatingTermIfNecessary();

        m_openGroups.append(Term(Term::GroupTerm));
    }

    void atomParentheticalAssertionBegin(bool = false)
    {
        fail(URLFilterParser::Group);
    }

    void atomParenthesesEnd()
    {
        if (hasError())
            return;

        sinkFloatingTermIfNecessary();
        ASSERT(!m_floatingTerm.isValid());

        m_floatingTerm = m_openGroups.takeLast();
    }

    void disjunction()
    {
        fail(URLFilterParser::Disjunction);
    }

private:
    bool hasError() const
    {
        return m_parseStatus != URLFilterParser::Ok;
    }

    void fail(URLFilterParser::ParseStatus reason)
    {
        if (hasError())
            return;

        m_parseStatus = reason;
    }

    void sinkFloatingTermIfNecessary()
    {
        if (!m_floatingTerm.isValid())
            return;

        if (m_hasProcessedEndOfLineAssertion) {
            fail(URLFilterParser::MisplacedEndOfLine);
            m_floatingTerm = Term();
            return;
        }

        if (m_floatingTerm.isEndOfLineAssertion())
            m_hasProcessedEndOfLineAssertion = true;

        if (!m_openGroups.isEmpty()) {
            m_openGroups.last().extendGroupSubpattern(m_floatingTerm);
            m_floatingTerm = Term();
            return;
        }

        m_sunkTerms.append(m_floatingTerm);
        m_floatingTerm = Term();
    }

    void simplifySunkTerms()
    {
        ASSERT(!m_floatingTerm.isValid());

        if (m_sunkTerms.isEmpty())
            return;

        Term canonicalDotStar(Term::UniversalTransition);
        canonicalDotStar.quantify(AtomQuantifier::ZeroOrMore);

        // Replace every ".*"-like terms by our canonical version. Remove any duplicate ".*".
        {
            unsigned termIndex = 0;
            bool isAfterDotStar = false;
            while (termIndex < m_sunkTerms.size()) {
                if (isAfterDotStar && m_sunkTerms[termIndex].isKnownToMatchAnyString()) {
                    m_sunkTerms.remove(termIndex);
                    continue;
                }
                isAfterDotStar = false;

                if (m_sunkTerms[termIndex].isKnownToMatchAnyString()) {
                    m_sunkTerms[termIndex] = canonicalDotStar;
                    isAfterDotStar = true;
                }
                ++termIndex;
            }
        }

        // Add our ".*" in front if needed.
        if (!m_hasBeginningOfLineAssertion && !m_sunkTerms.first().isKnownToMatchAnyString())
            m_sunkTerms.insert(0, canonicalDotStar);

        // Remove trailing ".*$".
        if (m_sunkTerms.size() > 2 && m_sunkTerms.last().isEndOfLineAssertion() && m_sunkTerms[m_sunkTerms.size() - 2].isKnownToMatchAnyString())
            m_sunkTerms.shrink(m_sunkTerms.size() - 2);

        // Remove irrelevant terms that can match empty. For example in "foob?", matching "b" is irrelevant.
        if (m_sunkTerms.last().isEndOfLineAssertion())
            return;
        while (!m_sunkTerms.isEmpty() && !m_sunkTerms.last().matchesAtLeastOneCharacter())
            m_sunkTerms.removeLast();
    }

    bool m_patternIsCaseSensitive;

    Deque<Term> m_openGroups;
    Vector<Term> m_sunkTerms;
    Term m_floatingTerm;
    bool m_hasBeginningOfLineAssertion { false };
    bool m_hasProcessedEndOfLineAssertion { false };

    URLFilterParser::ParseStatus m_parseStatus;
};

URLFilterParser::URLFilterParser(CombinedURLFilters& combinedURLFilters)
    : m_combinedURLFilters(combinedURLFilters)
{
}

URLFilterParser::~URLFilterParser()
{
}

URLFilterParser::ParseStatus URLFilterParser::addPattern(const String& pattern, bool patternIsCaseSensitive, uint64_t patternId)
{
    if (!pattern.containsOnlyASCII())
        return NonASCII;
    if (pattern.isEmpty())
        return EmptyPattern;

    ParseStatus status = Ok;
    PatternParser patternParser(patternIsCaseSensitive);
    String error = String(JSC::Yarr::parse(patternParser, pattern, 0));
    if (error.isNull())
        patternParser.finalize(patternId, m_combinedURLFilters);
    else
        status = YarrError;
    
    if (status == Ok)
        status = patternParser.parseStatus();

    return status;
}

String URLFilterParser::statusString(ParseStatus status)
{
    switch (status) {
    case Ok:
        return "Ok";
    case MatchesEverything:
        return "Matches everything.";
    case NonASCII:
        return "Only ASCII characters are supported in pattern.";
    case UnsupportedCharacterClass:
        return "Character class is not supported.";
    case BackReference:
        return "Patterns cannot contain backreferences.";
    case MisplacedStartOfLine:
        return "Start of line assertion can only appear as the first term in a filter.";
    case WordBoundary:
        return "Word boundaries assertions are not supported yet.";
    case AtomCharacter:
        return "Builtins character class atoms are not supported yet.";
    case Group:
        return "Groups are not supported yet.";
    case Disjunction:
        return "Disjunctions are not supported yet.";
    case MisplacedEndOfLine:
        return "The end of line assertion must be the last term in an expression.";
    case EmptyPattern:
        return "Empty pattern.";
    case YarrError:
        return "Internal error in YARR.";
    case InvalidQuantifier:
        return "Arbitrary atom repetitions are not supported.";
    }
}
    
} // namespace ContentExtensions
} // namespace WebCore

#endif // ENABLE(CONTENT_EXTENSIONS)