Term.h   [plain text]


/*
 * Copyright (C) 2014-2015 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#if ENABLE(CONTENT_EXTENSIONS)

#include "ContentExtensionsDebugging.h"
#include "NFA.h"
#include <unicode/utypes.h>
#include <wtf/ASCIICType.h>
#include <wtf/HashMap.h>
#include <wtf/Vector.h>
#include <wtf/text/StringBuilder.h>
#include <wtf/text/WTFString.h>

namespace WebCore {

namespace ContentExtensions {

enum class AtomQuantifier : uint8_t {
    One,
    ZeroOrOne,
    ZeroOrMore,
    OneOrMore
};

class Term {
public:
    Term();
    Term(char character, bool isCaseSensitive);

    enum UniversalTransitionTag { UniversalTransition };
    explicit Term(UniversalTransitionTag);

    enum CharacterSetTermTag { CharacterSetTerm };
    explicit Term(CharacterSetTermTag, bool isInverted);

    enum GroupTermTag { GroupTerm };
    explicit Term(GroupTermTag);

    enum EndOfLineAssertionTermTag { EndOfLineAssertionTerm };
    explicit Term(EndOfLineAssertionTermTag);

    Term(const Term& other);
    Term(Term&& other);

    enum EmptyValueTag { EmptyValue };
    Term(EmptyValueTag);

    ~Term();

    bool isValid() const;

    // CharacterSet terms only.
    void addCharacter(UChar character, bool isCaseSensitive);

    // Group terms only.
    void extendGroupSubpattern(const Term&);

    void quantify(const AtomQuantifier&);

    ImmutableCharNFANodeBuilder generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const;
    void generateGraph(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const;

    bool isEndOfLineAssertion() const;

    bool matchesAtLeastOneCharacter() const;

    // Matches any string, the empty string included.
    // This is very conservative. Patterns matching any string can return false here.
    bool isKnownToMatchAnyString() const;

    // Return true if the term can only match input of a single fixed length.
    bool hasFixedLength() const;

    Term& operator=(const Term& other);
    Term& operator=(Term&& other);

    bool operator==(const Term& other) const;

    unsigned hash() const;

    bool isEmptyValue() const;

#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
    String toString() const;
#endif

#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
    size_t memoryUsed() const;
#endif
    
private:
    // This is exact for character sets but conservative for groups.
    // The return value can be false for a group equivalent to a universal transition.
    bool isUniversalTransition() const;

    void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const;
    void generateSubgraphForAtom(NFA&, ImmutableCharNFANodeBuilder& source, uint32_t destination) const;

    void destroy();

    enum class TermType : uint8_t {
        Empty,
        CharacterSet,
        Group
    };

    TermType m_termType { TermType::Empty };
    AtomQuantifier m_quantifier { AtomQuantifier::One };

    class CharacterSet {
    public:
        void set(UChar character)
        {
            RELEASE_ASSERT(character < 128);
            m_characters[character / 64] |= (uint64_t(1) << (character % 64));
        }
        
        bool get(UChar character) const
        {
            RELEASE_ASSERT(character < 128);
            return m_characters[character / 64] & (uint64_t(1) << (character % 64));
        }
        
        void invert()
        {
            ASSERT(!m_inverted);
            m_inverted = true;
        }
        
        bool inverted() const { return m_inverted; }
        
        unsigned bitCount() const
        {
            return WTF::bitCount(m_characters[0]) + WTF::bitCount(m_characters[1]);
        }
        
        bool operator==(const CharacterSet& other) const
        {
            return other.m_inverted == m_inverted
                && other.m_characters[0] == m_characters[0]
                && other.m_characters[1] == m_characters[1];
        }

        unsigned hash() const
        {
            return WTF::pairIntHash(WTF::pairIntHash(WTF::intHash(m_characters[0]), WTF::intHash(m_characters[1])), m_inverted);
        }
    private:
        bool m_inverted { false };
        uint64_t m_characters[2] { 0, 0 };
    };

    struct Group {
        Vector<Term> terms;

        bool operator==(const Group& other) const
        {
            return other.terms == terms;
        }

        unsigned hash() const
        {
            unsigned hash = 6421749;
            for (const Term& term : terms) {
                unsigned termHash = term.hash();
                hash = (hash << 16) ^ ((termHash << 11) ^ hash);
                hash += hash >> 11;
            }
            return hash;
        }
    };

    union AtomData {
        AtomData()
            : invalidTerm(0)
        {
        }
        ~AtomData()
        {
        }

        char invalidTerm;
        CharacterSet characterSet;
        Group group;
    } m_atomData;
};

#if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING
inline String quantifierToString(AtomQuantifier quantifier)
{
    switch (quantifier) {
    case AtomQuantifier::One:
        return "";
    case AtomQuantifier::ZeroOrOne:
        return "?";
    case AtomQuantifier::ZeroOrMore:
        return "*";
    case AtomQuantifier::OneOrMore:
        return "+";
    }
}
    
inline String Term::toString() const
{
    switch (m_termType) {
    case TermType::Empty:
        return "(Empty)";
    case TermType::CharacterSet: {
        StringBuilder builder;
        builder.append('[');
        for (UChar c = 0; c < 128; c++) {
            if (m_atomData.characterSet.get(c)) {
                if (isASCIIPrintable(c) && !isASCIISpace(c))
                    builder.append(c);
                else {
                    builder.append('\\');
                    builder.append('u');
                    builder.appendNumber(c);
                }
            }
        }
        builder.append(']');
        builder.append(quantifierToString(m_quantifier));
        return builder.toString();
    }
    case TermType::Group: {
        StringBuilder builder;
        builder.append('(');
        for (const Term& term : m_atomData.group.terms)
            builder.append(term.toString());
        builder.append(')');
        builder.append(quantifierToString(m_quantifier));
        return builder.toString();
    }
    }
}
#endif

inline Term::Term()
{
}

inline Term::Term(char character, bool isCaseSensitive)
    : m_termType(TermType::CharacterSet)
{
    new (NotNull, &m_atomData.characterSet) CharacterSet();
    addCharacter(character, isCaseSensitive);
}

inline Term::Term(UniversalTransitionTag)
    : m_termType(TermType::CharacterSet)
{
    new (NotNull, &m_atomData.characterSet) CharacterSet();
    for (UChar i = 1; i < 128; ++i)
        m_atomData.characterSet.set(i);
}

inline Term::Term(CharacterSetTermTag, bool isInverted)
    : m_termType(TermType::CharacterSet)
{
    new (NotNull, &m_atomData.characterSet) CharacterSet();
    if (isInverted)
        m_atomData.characterSet.invert();
}

inline Term::Term(GroupTermTag)
    : m_termType(TermType::Group)
{
    new (NotNull, &m_atomData.group) Group();
}

inline Term::Term(EndOfLineAssertionTermTag)
    : Term(CharacterSetTerm, false)
{
    m_atomData.characterSet.set(0);
}

inline Term::Term(const Term& other)
    : m_termType(other.m_termType)
    , m_quantifier(other.m_quantifier)
{
    switch (m_termType) {
    case TermType::Empty:
        break;
    case TermType::CharacterSet:
        new (NotNull, &m_atomData.characterSet) CharacterSet(other.m_atomData.characterSet);
        break;
    case TermType::Group:
        new (NotNull, &m_atomData.group) Group(other.m_atomData.group);
        break;
    }
}

inline Term::Term(Term&& other)
    : m_termType(WTFMove(other.m_termType))
    , m_quantifier(WTFMove(other.m_quantifier))
{
    switch (m_termType) {
    case TermType::Empty:
        break;
    case TermType::CharacterSet:
        new (NotNull, &m_atomData.characterSet) CharacterSet(WTFMove(other.m_atomData.characterSet));
        break;
    case TermType::Group:
        new (NotNull, &m_atomData.group) Group(WTFMove(other.m_atomData.group));
        break;
    }
    other.destroy();
}

inline Term::Term(EmptyValueTag)
    : m_termType(TermType::Empty)
{
}

inline Term::~Term()
{
    destroy();
}

inline bool Term::isValid() const
{
    return m_termType != TermType::Empty;
}

inline void Term::addCharacter(UChar character, bool isCaseSensitive)
{
    ASSERT(isASCII(character));

    ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::CharacterSet);
    if (m_termType != TermType::CharacterSet)
        return;

    if (isCaseSensitive || !isASCIIAlpha(character))
        m_atomData.characterSet.set(character);
    else {
        m_atomData.characterSet.set(toASCIIUpper(character));
        m_atomData.characterSet.set(toASCIILower(character));
    }
}

inline void Term::extendGroupSubpattern(const Term& term)
{
    ASSERT_WITH_SECURITY_IMPLICATION(m_termType == TermType::Group);
    if (m_termType != TermType::Group)
        return;
    m_atomData.group.terms.append(term);
}

inline void Term::quantify(const AtomQuantifier& quantifier)
{
    ASSERT_WITH_MESSAGE(m_quantifier == AtomQuantifier::One, "Transition to quantified term should only happen once.");
    m_quantifier = quantifier;
}

inline ImmutableCharNFANodeBuilder Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ActionList& finalActions) const
{
    ImmutableCharNFANodeBuilder newEnd(nfa);
    generateGraph(nfa, source, newEnd.nodeId());
    newEnd.setActions(finalActions.begin(), finalActions.end());
    return newEnd;
}

inline void Term::generateGraph(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const
{
    ASSERT(isValid());

    switch (m_quantifier) {
    case AtomQuantifier::One: {
        generateSubgraphForAtom(nfa, source, destination);
        break;
    }
    case AtomQuantifier::ZeroOrOne: {
        generateSubgraphForAtom(nfa, source, destination);
        source.addEpsilonTransition(destination);
        break;
    }
    case AtomQuantifier::ZeroOrMore: {
        ImmutableCharNFANodeBuilder repeatStart(nfa);
        source.addEpsilonTransition(repeatStart);

        ImmutableCharNFANodeBuilder repeatEnd(nfa);
        generateSubgraphForAtom(nfa, repeatStart, repeatEnd);
        repeatEnd.addEpsilonTransition(repeatStart);

        repeatEnd.addEpsilonTransition(destination);
        source.addEpsilonTransition(destination);
        break;
    }
    case AtomQuantifier::OneOrMore: {
        ImmutableCharNFANodeBuilder repeatStart(nfa);
        source.addEpsilonTransition(repeatStart);

        ImmutableCharNFANodeBuilder repeatEnd(nfa);
        generateSubgraphForAtom(nfa, repeatStart, repeatEnd);
        repeatEnd.addEpsilonTransition(repeatStart);

        repeatEnd.addEpsilonTransition(destination);
        break;
    }
    }
}

inline bool Term::isEndOfLineAssertion() const
{
    return m_termType == TermType::CharacterSet && m_atomData.characterSet.bitCount() == 1 && m_atomData.characterSet.get(0);
}

inline bool Term::matchesAtLeastOneCharacter() const
{
    ASSERT(isValid());

    if (m_quantifier == AtomQuantifier::ZeroOrOne || m_quantifier == AtomQuantifier::ZeroOrMore)
        return false;
    if (isEndOfLineAssertion())
        return false;

    if (m_termType == TermType::Group) {
        for (const Term& term : m_atomData.group.terms) {
            if (term.matchesAtLeastOneCharacter())
                return true;
        }
        return false;
    }
    return true;
}

inline bool Term::isKnownToMatchAnyString() const
{
    ASSERT(isValid());

    switch (m_termType) {
    case TermType::Empty:
        ASSERT_NOT_REACHED();
        break;
    case TermType::CharacterSet:
        // ".*" is the only simple term matching any string.
        return isUniversalTransition() && m_quantifier == AtomQuantifier::ZeroOrMore;
        break;
    case TermType::Group: {
        // There are infinitely many ways to match anything with groups, we just handle simple cases
        if (m_atomData.group.terms.size() != 1)
            return false;

        const Term& firstTermInGroup = m_atomData.group.terms.first();
        // -(.*) with any quantifier.
        if (firstTermInGroup.isKnownToMatchAnyString())
            return true;

        if (firstTermInGroup.isUniversalTransition()) {
            // -(.)*, (.+)*, (.?)* etc.
            if (m_quantifier == AtomQuantifier::ZeroOrMore)
                return true;

            // -(.+)?.
            if (m_quantifier == AtomQuantifier::ZeroOrOne && firstTermInGroup.m_quantifier == AtomQuantifier::OneOrMore)
                return true;

            // -(.?)+.
            if (m_quantifier == AtomQuantifier::OneOrMore && firstTermInGroup.m_quantifier == AtomQuantifier::ZeroOrOne)
                return true;
        }
        break;
    }
    }
    return false;
}

inline bool Term::hasFixedLength() const
{
    ASSERT(isValid());

    switch (m_termType) {
    case TermType::Empty:
        ASSERT_NOT_REACHED();
        break;
    case TermType::CharacterSet:
        return m_quantifier == AtomQuantifier::One;
    case TermType::Group: {
        if (m_quantifier != AtomQuantifier::One)
            return false;
        for (const Term& term : m_atomData.group.terms) {
            if (!term.hasFixedLength())
                return false;
        }
        return true;
    }
    }
    return false;
}

inline Term& Term::operator=(const Term& other)
{
    destroy();
    new (NotNull, this) Term(other);
    return *this;
}

inline Term& Term::operator=(Term&& other)
{
    destroy();
    new (NotNull, this) Term(WTFMove(other));
    return *this;
}

inline bool Term::operator==(const Term& other) const
{
    if (other.m_termType != m_termType || other.m_quantifier != m_quantifier)
        return false;

    switch (m_termType) {
    case TermType::Empty:
        return true;
    case TermType::CharacterSet:
        return m_atomData.characterSet == other.m_atomData.characterSet;
    case TermType::Group:
        return m_atomData.group == other.m_atomData.group;
    }
    ASSERT_NOT_REACHED();
    return false;
}

inline unsigned Term::hash() const
{
    unsigned primary = static_cast<unsigned>(m_termType) << 16 | static_cast<unsigned>(m_quantifier);
    unsigned secondary = 0;
    switch (m_termType) {
    case TermType::Empty:
        secondary = 52184393;
        break;
    case TermType::CharacterSet:
        secondary = m_atomData.characterSet.hash();
        break;
    case TermType::Group:
        secondary = m_atomData.group.hash();
        break;
    }
    return WTF::pairIntHash(primary, secondary);
}

inline bool Term::isEmptyValue() const
{
    return m_termType == TermType::Empty;
}

inline bool Term::isUniversalTransition() const
{
    ASSERT(isValid());

    switch (m_termType) {
    case TermType::Empty:
        ASSERT_NOT_REACHED();
        break;
    case TermType::CharacterSet:
        return (m_atomData.characterSet.inverted() && !m_atomData.characterSet.bitCount())
            || (!m_atomData.characterSet.inverted() && m_atomData.characterSet.bitCount() == 127 && !m_atomData.characterSet.get(0));
    case TermType::Group:
        return m_atomData.group.terms.size() == 1 && m_atomData.group.terms.first().isUniversalTransition();
    }
    return false;
}

inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, const ImmutableCharNFANodeBuilder& destination) const
{
    generateSubgraphForAtom(nfa, source, destination.nodeId());
}

inline void Term::generateSubgraphForAtom(NFA& nfa, ImmutableCharNFANodeBuilder& source, uint32_t destination) const
{
    switch (m_termType) {
    case TermType::Empty:
        ASSERT_NOT_REACHED();
        source.addEpsilonTransition(destination);
        break;
    case TermType::CharacterSet: {
        if (!m_atomData.characterSet.inverted()) {
            UChar i = 0;
            while (true) {
                while (i < 128 && !m_atomData.characterSet.get(i))
                    ++i;
                if (i == 128)
                    break;

                UChar start = i;
                ++i;
                while (i < 128 && m_atomData.characterSet.get(i))
                    ++i;
                source.addTransition(start, i - 1, destination);
            }
        } else {
            UChar i = 1;
            while (true) {
                while (i < 128 && m_atomData.characterSet.get(i))
                    ++i;
                if (i == 128)
                    break;

                UChar start = i;
                ++i;
                while (i < 128 && !m_atomData.characterSet.get(i))
                    ++i;
                source.addTransition(start, i - 1, destination);
            }
        }
        break;
    }
    case TermType::Group: {
        if (m_atomData.group.terms.isEmpty()) {
            // FIXME: any kind of empty term could be avoided in the parser. This case should turned into an assertion.
            source.addEpsilonTransition(destination);
            return;
        }

        if (m_atomData.group.terms.size() == 1) {
            m_atomData.group.terms.first().generateGraph(nfa, source, destination);
            return;
        }

        ImmutableCharNFANodeBuilder lastTarget = m_atomData.group.terms.first().generateGraph(nfa, source, ActionList());
        for (unsigned i = 1; i < m_atomData.group.terms.size() - 1; ++i) {
            const Term& currentTerm = m_atomData.group.terms[i];
            ImmutableCharNFANodeBuilder newNode = currentTerm.generateGraph(nfa, lastTarget, ActionList());
            lastTarget = WTFMove(newNode);
        }
        const Term& lastTerm = m_atomData.group.terms.last();
        lastTerm.generateGraph(nfa, lastTarget, destination);
        break;
    }
    }
}

inline void Term::destroy()
{
    switch (m_termType) {
    case TermType::Empty:
        break;
    case TermType::CharacterSet:
        m_atomData.characterSet.~CharacterSet();
        break;
    case TermType::Group:
        m_atomData.group.~Group();
        break;
    }
    m_termType = TermType::Empty;
}

#if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING
inline size_t Term::memoryUsed() const
{
    size_t extraMemory = 0;
    if (m_termType == TermType::Group) {
        for (const Term& term : m_atomData.group.terms)
            extraMemory += term.memoryUsed();
    }
    return sizeof(Term) + extraMemory;
}
#endif

} // namespace ContentExtensions
} // namespace WebCore

#endif // ENABLE(CONTENT_EXTENSIONS)