// XMLTreeBuilder.cpp


/*
 * Copyright (C) 2011 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "XMLTreeBuilder.h"

#include "CachedScript.h"
#include "CDATASection.h"
#include "Comment.h"
#include "Document.h"
#include "DocumentFragment.h"
#include "DocumentType.h"
#include "Frame.h"
// FIXME: Why are we including HTML entity information in the XML parser?
#include "HTMLEntitySearch.h"
#include "HTMLEntityTable.h"
#include "NewXMLDocumentParser.h"
#include "ProcessingInstruction.h"
#include "ScriptElement.h"
#include "ScriptSourceCode.h"
#include "XMLNSNames.h"
#include "XMLNames.h"

namespace WebCore {

// Builds a tree directly into |document|. The document itself becomes the
// bottom entry of the node stack, and no element has been seen yet.
XMLTreeBuilder::XMLTreeBuilder(NewXMLDocumentParser* parser, Document* document)
    : m_document(document)
    , m_parser(parser)
    , m_isXHTML(false)
    , m_sawFirstElement(false)
{
    NodeStackItem rootItem(document);
    m_currentNodeStack.append(rootItem);
}

// Builds a tree into |fragment|, using |parent| (may be null) as the context
// element. Namespace declarations found on |parent| and its element ancestors
// are copied into the fragment's stack item so that prefixes used inside the
// fragment resolve. The builder starts with m_sawFirstElement set, so
// character data is accepted immediately.
XMLTreeBuilder::XMLTreeBuilder(NewXMLDocumentParser* parser, DocumentFragment* fragment, Element* parent)
    : m_document(fragment->document())
    , m_parser(parser)
    , m_isXHTML(false)
    , m_sawFirstElement(true)
{
    NodeStackItem fragmentItem(fragment);

    // Collect the context element followed by each element ancestor,
    // innermost first. The walk stops at the first non-element parent.
    Vector<Element*> ancestorChain;
    while (parent) {
        ancestorChain.append(parent);

        ContainerNode* container = parent->parentNode();
        if (!container || !container->isElementNode())
            break;
        parent = static_cast<Element*>(container);
    }

    // No context element: nothing to inherit.
    if (ancestorChain.isEmpty()) {
        m_currentNodeStack.append(fragmentItem);
        return;
    }

    // Consume the chain from the back (outermost ancestor) to the front
    // (context element) so that inner declarations override outer ones.
    while (!ancestorChain.isEmpty()) {
        Element* element = ancestorChain.last();
        ancestorChain.removeLast();

        if (!element->hasAttributes())
            continue;

        for (size_t index = 0; index < element->attributeCount(); ++index) {
            Attribute* attribute = element->attributeItem(index);
            if (attribute->localName() == xmlnsAtom)
                fragmentItem.setNamespaceURI(attribute->value());
            else if (attribute->prefix() == xmlnsAtom)
                fragmentItem.setNamespaceURI(attribute->localName(), attribute->value());
        }
    }

    // If the parent element is not in document tree, there may be no xmlns attribute; just default to the parent's namespace.
    // NOTE(review): |parent| was advanced by the walk above, so at this point
    // it is the outermost element ancestor rather than the original context
    // element - confirm that this is the intended namespace source.
    if (fragmentItem.namespaceURI().isNull() && !parent->inDocument())
        fragmentItem.setNamespaceURI(parent->namespaceURI());

    m_currentNodeStack.append(fragmentItem);
}

// Routes |token| to the handler that matches its type. An EndOfFile token
// only flushes any pending character data; an Uninitialized token is a
// programming error and trips an assertion.
void XMLTreeBuilder::processToken(const AtomicXMLToken& token)
{
    switch (token.type()) {
    // Prolog-style tokens.
    case XMLTokenTypes::XMLDeclaration:
        processXMLDeclaration(token);
        break;
    case XMLTokenTypes::DOCTYPE:
        processDOCTYPE(token);
        break;
    case XMLTokenTypes::ProcessingInstruction:
        processProcessingInstruction(token);
        break;

    // Element structure.
    case XMLTokenTypes::StartTag:
        processStartTag(token);
        break;
    case XMLTokenTypes::EndTag:
        processEndTag(token);
        break;

    // Content.
    case XMLTokenTypes::Character:
        processCharacter(token);
        break;
    case XMLTokenTypes::CDATA:
        processCDATA(token);
        break;
    case XMLTokenTypes::Comment:
        processComment(token);
        break;
    case XMLTokenTypes::Entity:
        processEntity(token);
        break;

    case XMLTokenTypes::EndOfFile:
        exitText();
        break;

    case XMLTokenTypes::Uninitialized:
        ASSERT_NOT_REACHED();
        break;
    }
}

// Called when no more tokens will arrive: flushes any character data that is
// still buffered into a Text node.
void XMLTreeBuilder::finish()
{
    exitText();
}

// Makes |stackItem| the new top of the open-node stack. The item must carry a
// node (asserted).
void XMLTreeBuilder::pushCurrentNode(const NodeStackItem& stackItem)
{
    ASSERT(stackItem.node());
    // FIXME: is there a maximum DOM depth?
    m_currentNodeStack.append(stackItem);
}

// Removes the top entry of the open-node stack. The stack must not be empty
// (asserted).
void XMLTreeBuilder::popCurrentNode()
{
    ASSERT(m_currentNodeStack.size());
    m_currentNodeStack.removeLast();
}

// Finishes |element|: notifies it that its children are complete, hands it to
// the parser if it is a script element, and then pops the open-node stack.
void XMLTreeBuilder::closeElement(PassRefPtr<Element> element)
{
    element->finishParsingChildren();

    // toScriptElement() yields null for anything that is not a script element.
    if (ScriptElement* script = toScriptElement(element.get()))
        m_parser->processScript(script);

    popCurrentNode();
}

void XMLTreeBuilder::processProcessingInstruction(const AtomicXMLToken& token)
{
    if (!failOnText())
        return;

    // FIXME: fall back if we can't handle the PI ourself.

    add(ProcessingInstruction::create(m_document, token.target(), token.data()));
}

// Handles an XML declaration token by copying its version and standalone
// values onto the document. If the document rejects either value (non-zero
// exception code), parsing is stopped.
//
// The function returns as soon as the first setter fails. Previously the
// failed exception code from setXMLVersion() was reused for
// setXMLStandalone(), so a stale error value triggered a second stopParsing()
// even when the second call succeeded. Returning early also matches how
// processAttributes() reacts to a failure.
void XMLTreeBuilder::processXMLDeclaration(const AtomicXMLToken& token)
{
    if (!failOnText())
        return;

    ExceptionCode ec = 0;

    m_document->setXMLVersion(String(token.xmlVersion()), ec);
    if (ec) {
        m_parser->stopParsing();
        return;
    }

    m_document->setXMLStandalone(token.xmlStandalone(), ec);
    if (ec)
        m_parser->stopParsing();
    // FIXME: how should this behave if standalone is not specified?
    // FIXME: set encoding.
}

void XMLTreeBuilder::processDOCTYPE(const AtomicXMLToken& token)
{
    DEFINE_STATIC_LOCAL(AtomicString, xhtmlTransitional, ("-//W3C//DTD XHTML 1.0 Transitional//EN"));
    DEFINE_STATIC_LOCAL(AtomicString, xhtml11, ("-//W3C//DTD XHTML 1.1//EN"));
    DEFINE_STATIC_LOCAL(AtomicString, xhtmlStrict, ("-//W3C//DTD XHTML 1.0 Strict//EN"));
    DEFINE_STATIC_LOCAL(AtomicString, xhtmlFrameset, ("-//W3C//DTD XHTML 1.0 Frameset//EN"));
    DEFINE_STATIC_LOCAL(AtomicString, xhtmlBasic, ("-//W3C//DTD XHTML Basic 1.0//EN"));
    DEFINE_STATIC_LOCAL(AtomicString, xhtmlMathML, ("-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN"));
    DEFINE_STATIC_LOCAL(AtomicString, xhtmlMathMLSVG, ("-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"));
    DEFINE_STATIC_LOCAL(AtomicString, xhtmlMobile, ("-//WAPFORUM//DTD XHTML Mobile 1.0//EN"));

    if (!failOnText())
        return;

    AtomicString publicIdentifier(token.publicIdentifier().data(), token.publicIdentifier().size());
    AtomicString systemIdentifier(token.systemIdentifier().data(), token.systemIdentifier().size());
    RefPtr<DocumentType> doctype = DocumentType::create(m_document, token.name(), publicIdentifier, systemIdentifier);
    m_document->setDocType(doctype);
    m_document->parserAddChild(doctype);

    if ((publicIdentifier == xhtmlTransitional)
        || (publicIdentifier == xhtml11)
        || (publicIdentifier == xhtmlStrict)
        || (publicIdentifier == xhtmlFrameset)
        || (publicIdentifier == xhtmlBasic)
        || (publicIdentifier == xhtmlMathML)
        || (publicIdentifier == xhtmlMathMLSVG)
        || (publicIdentifier == xhtmlMobile))
        m_isXHTML = true;
}

void XMLTreeBuilder::processStartTag(const AtomicXMLToken& token)
{
    exitText();

    bool isFirstElement = !m_sawFirstElement;
    m_sawFirstElement = true;

    NodeStackItem top = m_currentNodeStack.last();

    processNamespaces(token, top);

    QualifiedName qName(token.prefix(), token.name(), top.namespaceForPrefix(token.prefix(), top.namespaceURI()));
    RefPtr<Element> newElement = m_document->createElement(qName, true);

    processAttributes(token, top, newElement);

    newElement->beginParsingChildren();
    m_currentNodeStack.last().node()->parserAddChild(newElement.get());

    top.setNode(newElement);
    pushCurrentNode(top);

    if (!newElement->attached())
        newElement->attach();

    if (isFirstElement && m_document->frame())
        m_document->frame()->loader()->dispatchDocumentElementAvailable();

    if (token.selfClosing())
        closeElement(newElement);
}

// Handles an end tag: flushes pending text, checks that the tag name matches
// the element on top of the open-node stack, and closes that element.
//
// If the node on top of the stack is not an element (the Document or
// DocumentFragment pushed by the constructors), the end tag has no matching
// start tag. Parsing is stopped and nothing is closed: casting that node to
// Element and popping it would leave the stack empty, and every later
// m_currentNodeStack.last() call would then read past the end.
//
// If the top node is an element whose name does not match, parsing is stopped
// but the element is still closed, as before.
void XMLTreeBuilder::processEndTag(const AtomicXMLToken& token)
{
    exitText();

    RefPtr<ContainerNode> node = m_currentNodeStack.last().node();

    if (!node->isElementNode()) {
        // Stray end tag with no open element.
        m_parser->stopParsing();
        return;
    }

    if (!node->hasTagName(QualifiedName(token.prefix(), token.name(), m_currentNodeStack.last().namespaceForPrefix(token.prefix(), m_currentNodeStack.last().namespaceURI()))))
        m_parser->stopParsing();

    closeElement(toElement(node.get()));
}

// Handles a character token by appending its characters to the pending text
// buffer; the Text node itself is created later by exitText().
void XMLTreeBuilder::processCharacter(const AtomicXMLToken& token)
{
    appendToText(token.characters().data(), token.characters().size());
}

// Handles a CDATA token: flushes pending text, then appends a CDATASection
// node holding the token's data to the current node.
void XMLTreeBuilder::processCDATA(const AtomicXMLToken& token)
{
    exitText();

    add(CDATASection::create(m_document, token.data()));
}

// Handles a comment token: flushes pending text, then appends a Comment node
// holding the token's comment text to the current node.
void XMLTreeBuilder::processComment(const AtomicXMLToken& token)
{
    exitText();

    add(Comment::create(m_document, token.comment()));
}

// Handles an entity-reference token. Outside XHTML mode only the entities
// known to processXMLEntity() are accepted; in XHTML mode (selected by
// processDOCTYPE()) the name is looked up via processHTMLEntity().
void XMLTreeBuilder::processEntity(const AtomicXMLToken& token)
{
    // FIXME: we should support internal subset.
    if (!m_isXHTML) {
        processXMLEntity(token);
        return;
    }

    processHTMLEntity(token);
}

void XMLTreeBuilder::processNamespaces(const AtomicXMLToken& token, NodeStackItem& stackItem)
{
    for (unsigned i = 0; i < token.attributes().size(); ++i) {
        const Attribute& tokenAttribute = token.attributes().at(i);
        if (tokenAttribute.name().prefix() == xmlnsAtom)
            stackItem.setNamespaceURI(tokenAttribute.name().localName(), tokenAttribute.value());
        else if (tokenAttribute.name() == xmlnsAtom)
            stackItem.setNamespaceURI(tokenAttribute.value());
    }
}

void XMLTreeBuilder::processAttributes(const AtomicXMLToken& token, NodeStackItem& stackItem, PassRefPtr<Element> newElement)
{
    for (unsigned i = 0; i < token.attributes().size(); ++i) {
        const Attribute& tokenAttribute = token.attributes().at(i);
        ExceptionCode ec = 0;
        if (tokenAttribute.name().prefix() == xmlnsAtom)
            newElement->setAttributeNS(XMLNSNames::xmlnsNamespaceURI, "xmlns:" + tokenAttribute.name().localName(), tokenAttribute.value(), ec);
        else if (tokenAttribute.name() == xmlnsAtom)
            newElement->setAttributeNS(XMLNSNames::xmlnsNamespaceURI, xmlnsAtom, tokenAttribute.value(), ec);
        else {
            QualifiedName qName(tokenAttribute.prefix(), tokenAttribute.localName(), stackItem.namespaceForPrefix(tokenAttribute.prefix(), nullAtom));
            newElement->setAttribute(qName, tokenAttribute.value());
        }
        if (ec) {
            m_parser->stopParsing();
            return;
        }
    }
}

void XMLTreeBuilder::processXMLEntity(const AtomicXMLToken& token)
{
    DEFINE_STATIC_LOCAL(AtomicString, amp, ("amp"));
    DEFINE_STATIC_LOCAL(AtomicString, apos, ("apos"));
    DEFINE_STATIC_LOCAL(AtomicString, gt, ("gt"));
    DEFINE_STATIC_LOCAL(AtomicString, lt, ("lt"));
    DEFINE_STATIC_LOCAL(AtomicString, quot, ("quot"));
    DEFINE_STATIC_LOCAL(String, ampS, ("&"));
    DEFINE_STATIC_LOCAL(String, aposS, ("'"));
    DEFINE_STATIC_LOCAL(String, gtS, (">"));
    DEFINE_STATIC_LOCAL(String, ltS, ("<"));
    DEFINE_STATIC_LOCAL(String, quotS, ("\""));

    if (token.name() == amp)
        appendToText(ampS.characters(), 1);
    else if (token.name() == apos)
        appendToText(aposS.characters(), 1);
    else if (token.name() == gt)
        appendToText(gtS.characters(), 1);
    else if (token.name() == lt)
        appendToText(ltS.characters(), 1);
    else if (token.name() == quot)
        appendToText(quotS.characters(), 1);
    else
        m_parser->stopParsing();
}

void XMLTreeBuilder::processHTMLEntity(const AtomicXMLToken& token)
{
    HTMLEntitySearch search;
    const AtomicString& name = token.name();
    for (size_t i = 0; i < name.length(); ++i) {
        search.advance(name[i]);
        if (!search.isEntityPrefix()) {
            m_parser->stopParsing();
            return;
        }
    }
    search.advance(';');
    if (!search.isEntityPrefix()) {
        m_parser->stopParsing();
        return;
    }
    UChar32 entityValue = search.mostRecentMatch()->firstValue;
    // FIXME: We need to account for secondValue if any XML entities are longer
    // than one unicode character.
    ASSERT_NOT_REACHED();
    // Darin Adler writes:
    //   You can see given the code above that this else is dead code. This code is in a strange state.
    //   And the reinterpret_cast to UChar* makes the code little-endian-specific. That is not good!
    if (entityValue <= 0xFFFF)
        appendToText(reinterpret_cast<UChar*>(&entityValue), 1);
    else {
        UChar utf16Pair[2] = { U16_LEAD(entityValue), U16_TRAIL(entityValue) };
        appendToText(utf16Pair, 2);
    }
}

// Appends |node| as a child of the node on top of the open-node stack and
// attaches it if it is not attached already.
inline void XMLTreeBuilder::add(PassRefPtr<Node> node)
{
    m_currentNodeStack.last().node()->parserAddChild(node.get());

    if (node->attached())
        return;

    node->attach();
}

// Appends |length| UTF-16 code units starting at |text| to the pending text
// buffer. enterText() creates the buffer on demand; if it declines to (no
// element has been seen yet), the characters are dropped.
void XMLTreeBuilder::appendToText(const UChar* text, size_t length)
{
    enterText();

    if (m_leafText)
        m_leafText->append(text, length);
}

// Ensures a pending text buffer exists, but only once the first element has
// been seen. Before that point no buffer is created, so text is ignored.
void XMLTreeBuilder::enterText()
{
    // FIXME: Guarantee the text is only whitespace (applies to text that
    // arrives before the first element, which is skipped here).
    if (m_sawFirstElement && !m_leafText)
        m_leafText = adoptPtr(new StringBuilder());
}

void XMLTreeBuilder::exitText()
{
    if (!m_leafText.get())
        return;

    add(Text::create(m_document, m_leafText->toString()));

    m_leafText.clear();
}

// Discards any pending text buffer without creating a Text node and reports
// success. As written this always returns true; callers treat a false return
// as "abort handling this token".
bool XMLTreeBuilder::failOnText()
{
    if (m_leafText) {
        // FIXME: Guarantee the text is only whitespace.
        m_leafText.clear();
    }

    return true;
}

// Creates a stack item for node |n|. With a |parent| item, the default
// namespace and all prefix bindings are copied from it; without one, the item
// starts with only the "xml" prefix bound to the XML namespace.
XMLTreeBuilder::NodeStackItem::NodeStackItem(PassRefPtr<ContainerNode> n, NodeStackItem* parent)
    : m_node(n)
{
    if (parent) {
        m_namespace = parent->m_namespace;
        m_scopedNamespaces = parent->m_scopedNamespaces;
        return;
    }

    m_scopedNamespaces.set(xmlAtom, XMLNames::xmlNamespaceURI);
}

// Returns true when |prefix| has a binding in this item's scope. |prefix|
// must not be null (asserted).
bool XMLTreeBuilder::NodeStackItem::hasNamespaceURI(AtomicString prefix)
{
    ASSERT(!prefix.isNull());

    const bool isBound = m_scopedNamespaces.contains(prefix);
    return isBound;
}

// Returns the namespace URI bound to |prefix| in this item's scope, or
// nullAtom when the prefix has no binding. |prefix| must not be null
// (asserted).
AtomicString XMLTreeBuilder::NodeStackItem::namespaceURI(AtomicString prefix)
{
    ASSERT(!prefix.isNull());

    if (!m_scopedNamespaces.contains(prefix))
        return nullAtom;

    return m_scopedNamespaces.get(prefix);
}

// Binds |prefix| to |uri| in this item's scope, replacing any binding the
// prefix already had.
void XMLTreeBuilder::NodeStackItem::setNamespaceURI(AtomicString prefix, AtomicString uri)
{
    m_scopedNamespaces.set(prefix, uri);
}

// Resolves |prefix| to a namespace URI. Returns |fallback| when |prefix| is
// null or has no binding in this item's scope; otherwise returns the bound
// URI.
AtomicString XMLTreeBuilder::NodeStackItem::namespaceForPrefix(AtomicString prefix, AtomicString fallback)
{
    if (prefix.isNull() || !hasNamespaceURI(prefix))
        return fallback;

    return namespaceURI(prefix);
}

}