/** * This file is part of the DOM implementation for KDE. * * Copyright (C) 2000 Peter Kelly (pmk@post.com) * Copyright (C) 2005 Apple Computer, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public License * along with this library; see the file COPYING.LIB. If not, write to * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. */ #include "xml_tokenizer.h" #include "xml/dom_docimpl.h" #include "xml/dom_textimpl.h" #include "xml/dom_xmlimpl.h" #include "html/html_headimpl.h" #include "html/html_tableimpl.h" #include "misc/htmltags.h" #include "misc/htmlattrs.h" #include "misc/loader.h" #include "KWQLoader.h" #include #include "khtmlview.h" #include "khtml_part.h" #include #include #include #include #include using DOM::DocumentImpl; using DOM::DocumentImpl; using DOM::DOMString; using DOM::ElementImpl; using DOM::HTMLScriptElementImpl; using DOM::HTMLTableSectionElementImpl; using DOM::Node; using DOM::NodeImpl; using DOM::ProcessingInstructionImpl; using DOM::TextImpl; namespace khtml { const int maxErrors = 25; // FIXME: Move to the newer libxml API that handles namespaces and dump XMLNamespace, XMLAttributes, and XMLNamespaceStack. struct XMLNamespace { QString m_prefix; QString m_uri; XMLNamespace* m_parent; int m_ref; XMLNamespace() :m_parent(0), m_ref(0) {} XMLNamespace(const QString& p, const QString& u, XMLNamespace* parent) :m_prefix(p), m_uri(u), m_parent(parent), m_ref(0) { if (m_parent) m_parent->ref(); } QString uriForPrefix(const QString& prefix) { if (prefix == m_prefix) return m_uri; if (m_parent) return m_parent->uriForPrefix(prefix); return ""; } void ref() { m_ref++; } void deref() { if (--m_ref == 0) { if (m_parent) m_parent->deref(); delete this; } } }; class XMLAttributes { public: XMLAttributes() : _ref(0), _length(0), _names(0), _values(0), _uris(0) { } XMLAttributes(const char **expatStyleAttributes); ~XMLAttributes(); XMLAttributes(const XMLAttributes &); XMLAttributes &operator=(const XMLAttributes &); int length() const { return _length; } QString qName(int index) const { return _names[index]; } QString localName(int index) const; QString uri(int index) const { if (!_uris) return QString::null; return _uris[index]; } QString value(int index) const { return _values[index]; } QString value(const QString &) const; void split(XMLNamespace* ns); private: mutable int *_ref; int _length; QString *_names; QString *_values; QString *_uris; }; class XMLNamespaceStack { public: ~XMLNamespaceStack(); XMLNamespace *pushNamespaces(XMLAttributes& attributes); void popNamespaces(); private: QPtrStack m_namespaceStack; }; class XMLTokenizer : public Tokenizer, public CachedObjectClient { public: XMLTokenizer(DocumentImpl *, KHTMLView * = 0); ~XMLTokenizer(); enum ErrorType { warning, nonFatal, fatal }; // from Tokenizer virtual void write(const TokenizerString &str, bool); virtual void finish(); virtual void setOnHold(bool onHold); virtual bool isWaitingForScripts() const; #ifdef KHTML_XSLT void setTransformSource(DocumentImpl* doc); #endif // from CachedObjectClient virtual void notifyFinished(CachedObject *finishedObj); // callbacks from parser SAX void error(ErrorType, const char *message, va_list args); void startElement(const xmlChar *name, const xmlChar **libxmlAttributes); void endElement(); void characters(const xmlChar *s, int len); void processingInstruction(const xmlChar *target, const xmlChar *data); void cdataBlock(const xmlChar *s, int len); void comment(const xmlChar *s); private: void end(); int lineNumber() const; int columnNumber() const; void stopParsing(); void insertErrorMessageBlock(); void executeScripts(); void addScripts(NodeImpl *n); XMLNamespace *pushNamespaces(XMLAttributes& attributes) { return m_namespaceStack.pushNamespaces(attributes); } void popNamespaces() { m_namespaceStack.popNamespaces(); } bool enterText(); void exitText(); DocumentImpl *m_doc; KHTMLView *m_view; QString m_xmlCode; xmlParserCtxtPtr m_context; DOM::NodeImpl *m_currentNode; XMLNamespaceStack m_namespaceStack; bool m_sawError; bool m_parserStopped; bool m_sawXSLTransform; int m_errorCount; int m_lastErrorLine; int m_lastErrorColumn; DOMString m_errorMessages; QPtrList m_scripts; QPtrListIterator *m_scriptsIt; CachedScript *m_cachedScript; }; // -------------------------------- static int globalDescriptor = 0; static int matchFunc(const char* uri) { return 1; // Match everything. } static khtml::DocLoader *globalDocLoader = 0; class OffsetBuffer { public: OffsetBuffer(const QByteArray &b) : m_buffer(b), m_currentOffset(0) { } int readOutBytes(char *outputBuffer, unsigned askedToRead) { unsigned bytesLeft = m_buffer.size() - m_currentOffset; unsigned lenToCopy = kMin(askedToRead, bytesLeft); if (lenToCopy) { memcpy(outputBuffer, m_buffer.data() + m_currentOffset, lenToCopy); m_currentOffset += lenToCopy; } return lenToCopy; } private: QByteArray m_buffer; unsigned m_currentOffset; }; static bool shouldAllowExternalLoad(const char* inURI) { QString url(inURI); if (url.contains("/etc/catalog") || url.startsWith("http://www.w3.org/Graphics/SVG") || url.startsWith("http://www.w3.org/TR/xhtml")) return false; return true; } static void* openFunc(const char* uri) { if (!globalDocLoader || !shouldAllowExternalLoad(uri)) return &globalDescriptor; KURL finalURL; KIO::TransferJob *job = KIO::get(uri, true, false); QString headers; QByteArray data = KWQServeSynchronousRequest(Cache::loader(), globalDocLoader, job, finalURL, headers); return new OffsetBuffer(data); } static int readFunc(void* context, char* buffer, int len) { // Do 0-byte reads in case of a null descriptor if (context == &globalDescriptor) return 0; OffsetBuffer *data = static_cast(context); return data->readOutBytes(buffer, len); } static int writeFunc(void* context, const char* buffer, int len) { // Always just do 0-byte writes return 0; } static int closeFunc(void * context) { if (context != &globalDescriptor) { OffsetBuffer *data = static_cast(context); delete data; } return 0; } void setLoaderForLibXMLCallbacks(DocLoader *docLoader) { globalDocLoader = docLoader; } static xmlParserCtxtPtr createQStringParser(xmlSAXHandlerPtr handlers, void *userData) { static bool didInit = false; if (!didInit) { xmlInitParser(); xmlRegisterInputCallbacks(matchFunc, openFunc, readFunc, closeFunc); xmlRegisterOutputCallbacks(matchFunc, openFunc, writeFunc, closeFunc); didInit = true; } xmlParserCtxtPtr parser = xmlCreatePushParserCtxt(handlers, userData, NULL, 0, NULL); const QChar BOM(0xFEFF); const unsigned char BOMHighByte = *reinterpret_cast(&BOM); xmlSwitchEncoding(parser, BOMHighByte == 0xFF ? XML_CHAR_ENCODING_UTF16LE : XML_CHAR_ENCODING_UTF16BE); return parser; } static void parseQString(xmlParserCtxtPtr parser, const QString &string) { xmlParseChunk(parser, reinterpret_cast(string.unicode()), string.length() * sizeof(QChar), 1); } // -------------------------------- XMLTokenizer::XMLTokenizer(DocumentImpl *_doc, KHTMLView *_view) : m_doc(_doc), m_view(_view), m_context(NULL), m_currentNode(m_doc), m_sawError(false), m_parserStopped(false), m_errorCount(0), m_lastErrorLine(0), m_scriptsIt(0), m_cachedScript(0) { if (m_doc) m_doc->ref(); //FIXME: XMLTokenizer should use this in a fashion similiar to how //HTMLTokenizer uses loadStopped, in the future. loadStopped = false; } XMLTokenizer::~XMLTokenizer() { if (m_doc) m_doc->deref(); delete m_scriptsIt; if (m_cachedScript) m_cachedScript->deref(this); } void XMLTokenizer::write(const TokenizerString &s, bool /*appendData*/ ) { m_xmlCode += s.toString(); } void XMLTokenizer::setOnHold(bool onHold) { // Will we need to implement this when we do incremental XML parsing? } void XMLTokenizer::startElement(const xmlChar *name, const xmlChar **libxmlAttributes) { if (m_parserStopped) return; XMLAttributes atts(reinterpret_cast(libxmlAttributes)); XMLNamespace *ns = pushNamespaces(atts); atts.split(ns); QString qName = QString::fromUtf8(reinterpret_cast(name)); QString uri; QString prefix; int colonPos = qName.find(':'); if (colonPos != -1) { prefix = qName.left(colonPos); } uri = ns->uriForPrefix(prefix); if (m_currentNode->nodeType() == Node::TEXT_NODE) exitText(); int exceptioncode = 0; ElementImpl *newElement = m_doc->createElementNS(uri, qName, exceptioncode); if (!newElement) return; int i; for (i = 0; i < atts.length(); i++) { // FIXME: qualified name not supported for attributes! The prefix has been lost. DOMString uri(atts.uri(i)); DOMString ln(atts.localName(i)); DOMString val(atts.value(i)); NodeImpl::Id id = m_doc->attrId(uri.implementation(), ln.implementation(), false /* allocate */); newElement->setAttribute(id, val.implementation(), exceptioncode); if (exceptioncode) // exception setting attributes return; } // FIXME: This hack ensures implicit table bodies get constructed in XHTML and XML files. // We want to consolidate this with the HTML parser and HTML DOM code at some point. // For now, it's too risky to rip that code up. if (m_currentNode->id() == ID_TABLE && newElement->id() == ID_TR && m_currentNode->isHTMLElement() && newElement->isHTMLElement()) { NodeImpl* implicitTBody = new HTMLTableSectionElementImpl( m_doc, ID_TBODY, true /* implicit */ ); m_currentNode->addChild(implicitTBody); if (m_view && !implicitTBody->attached()) implicitTBody->attach(); m_currentNode = implicitTBody; } if (newElement->isHTMLElement() && newElement->id() == ID_SCRIPT) static_cast(newElement)->setCreatedByParser(true); if (m_currentNode->addChild(newElement)) { if (m_view && !newElement->attached()) newElement->attach(); m_currentNode = newElement; return; } else { delete newElement; return; } // ### DOM spec states: "if there is no markup inside an element's content, the text is contained in a // single object implementing the Text interface that is the only child of the element."... do we // need to ensure that empty elements always have an empty text child? } void XMLTokenizer::endElement() { if (m_parserStopped) return; popNamespaces(); if (m_currentNode->nodeType() == Node::TEXT_NODE) exitText(); if (m_currentNode->parentNode() != 0) { do { m_currentNode = m_currentNode->parentNode(); } while (m_currentNode && m_currentNode->implicitNode()); } // ### else error } void XMLTokenizer::characters(const xmlChar *s, int len) { if (m_parserStopped) return; if (m_currentNode->nodeType() == Node::TEXT_NODE || m_currentNode->nodeType() == Node::CDATA_SECTION_NODE || enterText()) { int exceptioncode = 0; static_cast(m_currentNode)->appendData(QString::fromUtf8(reinterpret_cast(s), len), exceptioncode); } } bool XMLTokenizer::enterText() { NodeImpl *newNode = m_doc->createTextNode(""); if (m_currentNode->addChild(newNode)) { m_currentNode = newNode; return true; } else { delete newNode; return false; } } void XMLTokenizer::exitText() { if (m_view && m_currentNode && !m_currentNode->attached()) m_currentNode->attach(); NodeImpl* par = m_currentNode->parentNode(); if (par != 0) m_currentNode = par; } void XMLTokenizer::error(ErrorType type, const char *message, va_list args) { if (m_parserStopped) { return; } if (type == fatal || (m_errorCount < maxErrors && m_lastErrorLine != lineNumber() && m_lastErrorColumn != columnNumber())) { QString format; switch (type) { case warning: #if APPLE_CHANGES format = QString("warning on line %2 at column %3: %1"); #else format = i18n( "warning: %1 in line %2, column %3\n" ); #endif break; case fatal: #if APPLE_CHANGES // fall through #else format = i18n( "fatal error: %1 in line %2, column %3\n" ); break; #endif default: #if APPLE_CHANGES format = QString("error on line %2 at column %3: %1"); #else format = i18n( "error: %1 in line %2, column %3\n" ); #endif } char *m; vasprintf(&m, message, args); m_errorMessages += format.arg(m).arg(lineNumber()).arg(columnNumber()); free(m); m_lastErrorLine = lineNumber(); m_lastErrorColumn = columnNumber(); ++m_errorCount; } if (type != warning) m_sawError = true; if (type == fatal) stopParsing(); } void XMLTokenizer::processingInstruction(const xmlChar *target, const xmlChar *data) { if (m_parserStopped) { return; } if (m_currentNode->nodeType() == Node::TEXT_NODE) exitText(); // ### handle exceptions ProcessingInstructionImpl *pi = m_doc->createProcessingInstruction( QString::fromUtf8(reinterpret_cast(target)), QString::fromUtf8(reinterpret_cast(data))); m_currentNode->addChild(pi); // don't load stylesheets for standalone documents if (m_doc->part()) { m_sawXSLTransform = !pi->checkStyleSheet(); if (m_sawXSLTransform) // Stop the SAX parser. stopParsing(); } } void XMLTokenizer::cdataBlock(const xmlChar *s, int len) { if (m_parserStopped) { return; } if (m_currentNode->nodeType() == Node::TEXT_NODE) exitText(); NodeImpl *newNode = m_doc->createCDATASection(""); if (m_currentNode->addChild(newNode)) { if (m_view && !newNode->attached()) newNode->attach(); m_currentNode = newNode; } else { delete newNode; return; } characters(s, len); if (m_currentNode->parentNode() != 0) m_currentNode = m_currentNode->parentNode(); } void XMLTokenizer::comment(const xmlChar *s) { if (m_parserStopped) return; if (m_currentNode->nodeType() == Node::TEXT_NODE) exitText(); // ### handle exceptions m_currentNode->addChild(m_doc->createComment(QString::fromUtf8(reinterpret_cast(s)))); } static void startElementHandler(void *userData, const xmlChar *name, const xmlChar **libxmlAttributes) { static_cast(userData)->startElement(name, libxmlAttributes); } static void endElementHandler(void *userData, const xmlChar *name) { static_cast(userData)->endElement(); } static void charactersHandler(void *userData, const xmlChar *s, int len) { static_cast(userData)->characters(s, len); } static void processingInstructionHandler(void *userData, const xmlChar *target, const xmlChar *data) { static_cast(userData)->processingInstruction(target, data); } static void cdataBlockHandler(void *userData, const xmlChar *s, int len) { static_cast(userData)->cdataBlock(s, len); } static void commentHandler(void *userData, const xmlChar *comment) { static_cast(userData)->comment(comment); } static void warningHandler(void *userData, const char *message, ...) { va_list args; va_start(args, message); static_cast(userData)->error(XMLTokenizer::warning, message, args); va_end(args); } static void fatalErrorHandler(void *userData, const char *message, ...) { va_list args; va_start(args, message); static_cast(userData)->error(XMLTokenizer::fatal, message, args); va_end(args); } static void normalErrorHandler(void *userData, const char *message, ...) { va_list args; va_start(args, message); static_cast(userData)->error(XMLTokenizer::nonFatal, message, args); va_end(args); } void XMLTokenizer::finish() { xmlSAXHandler sax; memset(&sax, 0, sizeof(sax)); sax.error = normalErrorHandler; sax.fatalError = fatalErrorHandler; sax.characters = charactersHandler; sax.endElement = endElementHandler; sax.processingInstruction = processingInstructionHandler; sax.startElement = startElementHandler; sax.cdataBlock = cdataBlockHandler; sax.comment = commentHandler; sax.warning = warningHandler; m_parserStopped = false; m_sawError = false; m_sawXSLTransform = false; m_context = createQStringParser(&sax, this); parseQString(m_context, m_xmlCode); xmlFreeParserCtxt(m_context); m_context = NULL; if (m_sawError) insertErrorMessageBlock(); else { // Parsing was successful. Now locate all html