XPathParser.cpp   [plain text]


/*
 * Copyright 2005 Maksim Orlovich <maksim@kde.org>
 * Copyright (C) 2006, 2013 Apple Inc. All rights reserved.
 * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "XPathParser.h"

#include "ExceptionCode.h"
#include "XPathEvaluator.h"
#include "XPathException.h"
#include "XPathNSResolver.h"
#include "XPathPath.h"
#include <wtf/NeverDestroyed.h>
#include <wtf/StdLibExtras.h>
#include <wtf/text/StringHash.h>

using namespace WebCore;
using namespace XPath;

extern int xpathyyparse(Parser&);

#include "XPathGrammar.h"

namespace WebCore {
namespace XPath {

struct Parser::Token {
    int type;
    String string;
    Step::Axis axis;
    NumericOp::Opcode numericOpcode;
    EqTestOp::Opcode equalityTestOpcode;

    Token(int type) : type(type) { }
    Token(int type, const String& string) : type(type), string(string) { }
    Token(int type, Step::Axis axis) : type(type), axis(axis) { }
    Token(int type, NumericOp::Opcode opcode) : type(type), numericOpcode(opcode) { }
    Token(int type, EqTestOp::Opcode opcode) : type(type), equalityTestOpcode(opcode) { }
};

enum XMLCat { NameStart, NameCont, NotPartOfName };

static XMLCat charCat(UChar character)
{
    if (character == '_')
        return NameStart;

    if (character == '.' || character == '-')
        return NameCont;
    unsigned characterTypeMask = U_GET_GC_MASK(character);
    if (characterTypeMask & (U_GC_LU_MASK | U_GC_LL_MASK | U_GC_LO_MASK | U_GC_LT_MASK | U_GC_NL_MASK))
        return NameStart;
    if (characterTypeMask & (U_GC_M_MASK | U_GC_LM_MASK | U_GC_ND_MASK))
        return NameCont;
    return NotPartOfName;
}

static void populateAxisNamesMap(HashMap<String, Step::Axis>& axisNames)
{
    struct AxisName {
        const char* name;
        Step::Axis axis;
    };
    const AxisName axisNameList[] = {
        { "ancestor", Step::AncestorAxis },
        { "ancestor-or-self", Step::AncestorOrSelfAxis },
        { "attribute", Step::AttributeAxis },
        { "child", Step::ChildAxis },
        { "descendant", Step::DescendantAxis },
        { "descendant-or-self", Step::DescendantOrSelfAxis },
        { "following", Step::FollowingAxis },
        { "following-sibling", Step::FollowingSiblingAxis },
        { "namespace", Step::NamespaceAxis },
        { "parent", Step::ParentAxis },
        { "preceding", Step::PrecedingAxis },
        { "preceding-sibling", Step::PrecedingSiblingAxis },
        { "self", Step::SelfAxis }
    };
    for (auto& axisName : axisNameList)
        axisNames.add(axisName.name, axisName.axis);
}

static bool parseAxisName(const String& name, Step::Axis& type)
{
    static NeverDestroyed<HashMap<String, Step::Axis>> axisNames;
    if (axisNames.get().isEmpty())
        populateAxisNamesMap(axisNames);

    auto it = axisNames.get().find(name);
    if (it == axisNames.get().end())
        return false;
    type = it->value;
    return true;
}

// Returns whether the current token can possibly be a binary operator, given
// the previous token. Necessary to disambiguate some of the operators
// (* (multiply), div, and, or, mod) in the [32] Operator rule
// (check http://www.w3.org/TR/xpath#exprlex).
bool Parser::isBinaryOperatorContext() const
{
    switch (m_lastTokenType) {
    case 0:
    case '@': case AXISNAME: case '(': case '[': case ',':
    case AND: case OR: case MULOP:
    case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
    case EQOP: case RELOP:
        return false;
    default:
        return true;
    }
}

void Parser::skipWS()
{
    while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos]))
        ++m_nextPos;
}

Parser::Token Parser::makeTokenAndAdvance(int code, int advance)
{
    m_nextPos += advance;
    return Token(code);
}

Parser::Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance)
{
    m_nextPos += advance;
    return Token(code, val);
}

Parser::Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance)
{
    m_nextPos += advance;
    return Token(code, val);
}

// Returns next char if it's there and interesting, 0 otherwise.
char Parser::peekAheadHelper()
{
    if (m_nextPos + 1 >= m_data.length())
        return 0;
    UChar next = m_data[m_nextPos + 1];
    if (next >= 0xff)
        return 0;
    return next;
}

char Parser::peekCurHelper()
{
    if (m_nextPos >= m_data.length())
        return 0;
    UChar next = m_data[m_nextPos];
    if (next >= 0xff)
        return 0;
    return next;
}

Parser::Token Parser::lexString()
{
    UChar delimiter = m_data[m_nextPos];
    int startPos = m_nextPos + 1;

    for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
        if (m_data[m_nextPos] == delimiter) {
            String value = m_data.substring(startPos, m_nextPos - startPos);
            if (value.isNull())
                value = emptyString();
            ++m_nextPos; // Consume the char.
            return Token(LITERAL, value);
        }
    }

    // Ouch, went off the end -- report error.
    return Token(XPATH_ERROR);
}

Parser::Token Parser::lexNumber()
{
    int startPos = m_nextPos;
    bool seenDot = false;

    // Go until end or a non-digits character.
    for (; m_nextPos < m_data.length(); ++m_nextPos) {
        UChar aChar = m_data[m_nextPos];
        if (aChar >= 0xff) break;

        if (aChar < '0' || aChar > '9') {
            if (aChar == '.' && !seenDot)
                seenDot = true;
            else
                break;
        }
    }

    return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos));
}

bool Parser::lexNCName(String& name)
{
    int startPos = m_nextPos;
    if (m_nextPos >= m_data.length())
        return false;

    if (charCat(m_data[m_nextPos]) != NameStart)
        return false;

    // Keep going until we get a character that's not good for names.
    for (; m_nextPos < m_data.length(); ++m_nextPos)
        if (charCat(m_data[m_nextPos]) == NotPartOfName)
            break;

    name = m_data.substring(startPos, m_nextPos - startPos);
    return true;
}

bool Parser::lexQName(String& name)
{
    String n1;
    if (!lexNCName(n1))
        return false;

    skipWS();

    // If the next character is :, what we just got it the prefix, if not,
    // it's the whole thing.
    if (peekAheadHelper() != ':') {
        name = n1;
        return true;
    }

    String n2;
    if (!lexNCName(n2))
        return false;

    name = n1 + ":" + n2;
    return true;
}

inline Parser::Token Parser::nextTokenInternal()
{
    skipWS();

    if (m_nextPos >= m_data.length())
        return Token(0);

    char code = peekCurHelper();
    switch (code) {
    case '(': case ')': case '[': case ']':
    case '@': case ',': case '|':
        return makeTokenAndAdvance(code);
    case '\'':
    case '\"':
        return lexString();
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
        return lexNumber();
    case '.': {
        char next = peekAheadHelper();
        if (next == '.')
            return makeTokenAndAdvance(DOTDOT, 2);
        if (next >= '0' && next <= '9')
            return lexNumber();
        return makeTokenAndAdvance('.');
    }
    case '/':
        if (peekAheadHelper() == '/')
            return makeTokenAndAdvance(SLASHSLASH, 2);
        return makeTokenAndAdvance('/');
    case '+':
        return makeTokenAndAdvance(PLUS);
    case '-':
        return makeTokenAndAdvance(MINUS);
    case '=':
        return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ);
    case '!':
        if (peekAheadHelper() == '=')
            return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2);
        return Token(XPATH_ERROR);
    case '<':
        if (peekAheadHelper() == '=')
            return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2);
        return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT);
    case '>':
        if (peekAheadHelper() == '=')
            return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2);
        return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT);
    case '*':
        if (isBinaryOperatorContext())
            return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul);
        ++m_nextPos;
        return Token(NAMETEST, "*");
    case '$': { // $ QName
        m_nextPos++;
        String name;
        if (!lexQName(name))
            return Token(XPATH_ERROR);
        return Token(VARIABLEREFERENCE, name);
    }
    }

    String name;
    if (!lexNCName(name))
        return Token(XPATH_ERROR);

    skipWS();
    // If we're in an operator context, check for any operator names
    if (isBinaryOperatorContext()) {
        if (name == "and") //### hash?
            return Token(AND);
        if (name == "or")
            return Token(OR);
        if (name == "mod")
            return Token(MULOP, NumericOp::OP_Mod);
        if (name == "div")
            return Token(MULOP, NumericOp::OP_Div);
    }

    // See whether we are at a :
    if (peekCurHelper() == ':') {
        m_nextPos++;
        // Any chance it's an axis name?
        if (peekCurHelper() == ':') {
            m_nextPos++;
            
            //It might be an axis name.
            Step::Axis axis;
            if (parseAxisName(name, axis))
                return Token(AXISNAME, axis);
            // Ugh, :: is only valid in axis names -> error
            return Token(XPATH_ERROR);
        }

        // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
        skipWS();
        if (peekCurHelper() == '*') {
            m_nextPos++;
            return Token(NAMETEST, name + ":*");
        }
        
        // Make a full qname.
        String n2;
        if (!lexNCName(n2))
            return Token(XPATH_ERROR);
        
        name = name + ":" + n2;
    }

    skipWS();

    if (peekCurHelper() == '(') {
        // note: we don't swallow the '(' here!

        // Either node type oor function name.

        if (name == "processing-instruction")
            return Token(PI);
        if (name == "node")
            return Token(NODE);
        if (name == "text")
            return Token(TEXT);
        if (name == "comment")
            return Token(COMMENT);

        return Token(FUNCTIONNAME, name);
    }

    // At this point, it must be NAMETEST.
    return Token(NAMETEST, name);
}

inline Parser::Token Parser::nextToken()
{
    Token token = nextTokenInternal();
    m_lastTokenType = token.type;
    return token;
}

Parser::Parser(const String& statement, RefPtr<XPathNSResolver>&& resolver)
    : m_data(statement)
    , m_resolver(WTFMove(resolver))
    , m_nextPos(0)
    , m_lastTokenType(0)
    , m_sawNamespaceError(false)
{
}

int Parser::lex(YYSTYPE& yylval)
{
    Token token = nextToken();

    switch (token.type) {
    case AXISNAME:
        yylval.axis = token.axis;
        break;
    case MULOP:
        yylval.numericOpcode = token.numericOpcode;
        break;
    case RELOP:
    case EQOP:
        yylval.equalityTestOpcode = token.equalityTestOpcode;
        break;
    case NODETYPE:
    case FUNCTIONNAME:
    case LITERAL:
    case VARIABLEREFERENCE:
    case NUMBER:
    case NAMETEST:
        yylval.string = token.string.releaseImpl().leakRef();
        break;
    }

    return token.type;
}

bool Parser::expandQualifiedName(const String& qualifiedName, String& localName, String& namespaceURI)
{
    size_t colon = qualifiedName.find(':');
    if (colon != notFound) {
        if (!m_resolver) {
            m_sawNamespaceError = true;
            return false;
        }
        namespaceURI = m_resolver->lookupNamespaceURI(qualifiedName.left(colon));
        if (namespaceURI.isNull()) {
            m_sawNamespaceError = true;
            return false;
        }
        localName = qualifiedName.substring(colon + 1);
    } else
        localName = qualifiedName;

    return true;
}

std::unique_ptr<Expression> Parser::parseStatement(const String& statement, RefPtr<XPathNSResolver>&& resolver, ExceptionCode& ec)
{
    Parser parser(statement, WTFMove(resolver));

    int parseError = xpathyyparse(parser);

    if (parser.m_sawNamespaceError) {
        ec = NAMESPACE_ERR;
        return nullptr;
    }

    if (parseError) {
        ec = XPathException::INVALID_EXPRESSION_ERR;
        return nullptr;
    }

    return WTFMove(parser.m_result);
}

} }