#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "html/htmltokenizer.h"
#include "html/html_documentimpl.h"
#include "html/htmlparser.h"
#include "html/dtd.h"
#include "misc/loader.h"
#include "misc/htmlhashes.h"
#include "khtmlview.h"
#include "khtml_part.h"
#include "xml/dom_docimpl.h"
#include "css/csshelper.h"
#include "ecma/kjs_proxy.h"
#include <kcharsets.h>
#include <kglobal.h>
#include <ctype.h>
#include <assert.h>
#include <qvariant.h>
#include <kdebug.h>
#include <stdlib.h>
using DOM::AtomicString;
using DOM::AttributeImpl;
using DOM::DOMString;
using DOM::DOMStringImpl;
using DOM::DocumentImpl;
using DOM::FORBIDDEN;
using DOM::Node;
using DOM::emptyAtom;
using DOM::endTagRequirement;
#undef __inline
#define __inline
#include "kentities.c"
#undef __inline
#define TOKENIZER_CHUNK_SIZE 4096
#define TOKENIZER_TIME_DELAY 500
namespace khtml {
// Opening marker of an HTML comment. parseTag() compares incoming characters
// against it one position at a time (indexed by searchCount).
static const char commentStart [] = "<!--";
// Closing-tag prefixes for elements whose content is collected by
// parseSpecial(). parseTag() stores one of them in searchStopper (together
// with its length in searchStopperLen) when the matching start tag is seen.
static const char scriptEnd [] = "</script";
static const char xmpEnd [] = "</xmp";
static const char styleEnd [] = "</style";
static const char textareaEnd [] = "</textarea";
static const char titleEnd [] = "</title";
// Raw storage helpers for vectors of QChar, backed by malloc/realloc/free.
// Allocates uninitialized room for N QChars; evaluates to a QChar* (may be null).
#define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
// Resizes the vector held in the pointer variable P to N QChars and stores the
// resulting pointer back into P. P must be an assignable QChar* expression.
// (The previous definition referred to an unrelated identifier `p` and
// assigned through a cast, so it could not be used with an arbitrary P.)
#define KHTML_REALLOC_QCHAR_VEC(P, N ) (P) = (QChar*) realloc((P), sizeof(QChar)*( N ))
// Releases a vector obtained from one of the two macros above.
#define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
#if APPLE_CHANGES
// Rewrites a character in the range U+0080..U+009F to the printable
// character listed for that position (the values follow the windows-1252
// layout, e.g. 0x80 -> U+20AC, 0x93 -> U+201C). Positions 0x81, 0x8D, 0x8F,
// 0x90 and 0x9D, and every character outside the range, are left unchanged.
inline void fixUpChar(QChar& c) {
    // Replacement code points indexed by (code - 0x0080); 0 marks a position
    // that is deliberately left as it is.
    static const int replacements[32] = {
        0x20AC, 0,      0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0,      0x017D, 0,
        0,      0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0,      0x017E, 0x0178
    };

    const unsigned int code = c.unicode();
    if (code < 0x0080 || code > 0x009F)
        return;

    const int mapped = replacements[code - 0x0080];
    if (mapped != 0)
        c = mapped;
}
#else // APPLE_CHANGES
// Macro variant of fixUpChar used when APPLE_CHANGES is not set.
// It expands to a complete if/else statement, so it has to be used as a
// statement of its own and (x) must be an assignable QChar expression
// (it is evaluated several times).
//  - Row 0: cells 0x80..0x9F (plus 0xb7) are replaced, partly by Unicode
//    characters (e.g. 0x80 -> 0x20ac) and partly by plain ASCII look-alikes
//    (quotes, '-', '*', '<', '>', '~', ',').
//  - Other rows: a few typographic characters (dashes, curly quotes, bullet)
//    are folded to ASCII; 0x2122 is assigned to itself, i.e. kept.
#define fixUpChar(x) \
if (!(x).row() ) { \
switch ((x).cell()) \
{ \
\
case 0x80: (x) = 0x20ac; break; \
case 0x82: (x) = ','; break; \
case 0x83: (x) = 0x0192; break; \
case 0x84: (x) = '"'; break; \
case 0x85: (x) = 0x2026; break; \
case 0x86: (x) = 0x2020; break; \
case 0x87: (x) = 0x2021; break; \
case 0x88: (x) = 0x02C6; break; \
case 0x89: (x) = 0x2030; break; \
case 0x8A: (x) = 0x0160; break; \
case 0x8b: (x) = '<'; break; \
case 0x8C: (x) = 0x0152; break; \
\
case 0x8E: (x) = 0x017D; break; \
\
\
case 0x91: (x) = '\''; break; \
case 0x92: (x) = '\''; break; \
case 0x93: (x) = '"'; break; \
case 0x94: (x) = '"'; break; \
case 0x95: (x) = '*'; break; \
case 0x96: (x) = '-'; break; \
case 0x97: (x) = '-'; break; \
case 0x98: (x) = '~'; break; \
case 0x99: (x) = 0x2122; break; \
case 0x9A: (x) = 0x0161; break; \
case 0x9b: (x) = '>'; break; \
case 0x9C: (x) = 0x0153; break; \
\
case 0x9E: (x) = 0x017E; break; \
case 0x9F: (x) = 0x0178; break; \
\
case 0xb7: (x) = '*'; break; \
default: break; \
} \
} \
else { \
\
switch( (x).unicode() ) { \
case 0x2013: (x) = '-'; break; \
case 0x2014: (x) = '-'; break; \
case 0x2018: (x) = '\''; break; \
case 0x2019: (x) = '\''; break; \
case 0x201c: (x) = '"'; break; \
case 0x201d: (x) = '"'; break; \
case 0x2022: (x) = '*'; break; \
case 0x2122: (x) = 0x2122; break; \
default: break; \
} \
}
#endif // APPLE_CHANGES
inline bool tagMatch(const char *s1, const QChar *s2, uint length)
{
for (uint i = 0; i != length; ++i) {
char c1 = s1[i];
char uc1 = toupper(c1);
QChar c2 = s2[i];
if (c1 != c2 && uc1 != c2)
return false;
}
return true;
}
// Creates a tokenizer attached to a view. A KHTMLParser is created for
// (_view, _doc); includesComments is forwarded to it and remembered in
// includesCommentsInDOM. The constructor finishes by calling begin().
HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, KHTMLView *_view, bool includesComments)
    : inWrite(false)
{
    // Collaborators.
    view = _view;
    charsets = KGlobal::charsets();
    parser = new KHTMLParser(_view, _doc, includesComments);
    includesCommentsInDOM = includesComments;

    // No storage yet: reset() frees these pointers only when they are non-null.
    buffer = 0;
    scriptCode = 0;
    scriptCodeResync = 0;
    scriptCodeMaxSize = 0;
    scriptCodeSize = 0;

    // Script / scheduling state.
    m_executingScript = 0;
    loadingExtScript = false;
    onHold = false;
    timerId = 0;
    loadStopped = false;

    attrNamePresent = false;

    begin();
}
// Creates a tokenizer that is not attached to a view (view stays 0) and whose
// KHTMLParser is built for the document fragment `i`. includesComments is
// forwarded to the parser and remembered in includesCommentsInDOM. The
// constructor finishes by calling begin().
HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, DOM::DocumentFragmentImpl *i, bool includesComments)
    : inWrite(false)
{
    view = 0;
    buffer = 0;
    scriptCode = 0;
    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
    charsets = KGlobal::charsets();
    parser = new KHTMLParser(i, _doc, includesComments);
    m_executingScript = 0;
    loadingExtScript = false;
    onHold = false;
    // Initialise this flag exactly like the view-based constructor does;
    // neither begin() nor reset() assigns it, so it would otherwise hold an
    // indeterminate value until the first attribute name has been parsed.
    attrNamePresent = false;
    timerId = 0;
    includesCommentsInDOM = includesComments;
    loadStopped = false;
    begin();
}
// Releases everything the tokenizer holds: dereferences all queued cached
// scripts, frees the token and script buffers, stops a running timer and
// clears the current token. Must not be called while a script is executing
// or while the tokenizer is on hold (both are asserted).
void HTMLTokenizer::reset()
{
    assert(m_executingScript == 0);
    assert(onHold == false);

    // Drop our reference on every script that is still queued.
    while (!cachedScript.isEmpty())
        cachedScript.dequeue()->deref(this);

    // Token buffer.
    if (buffer)
        KHTML_DELETE_QCHAR_VEC(buffer);
    dest = 0;
    buffer = 0;
    size = 0;

    // Script buffer.
    if (scriptCode)
        KHTML_DELETE_QCHAR_VEC(scriptCode);
    scriptCode = 0;
    scriptCodeResync = 0;
    scriptCodeMaxSize = 0;
    scriptCodeSize = 0;

    // Pending timer, if any.
    if (timerId)
        killTimer(timerId);
    timerId = 0;

    allowYield = false;
    forceSynchronous = false;

    currToken.reset();
}
// Puts the tokenizer into its initial state: calls reset(), allocates a
// fresh token buffer and clears every mode flag, counter and pending-source
// holder used while tokenizing.
void HTMLTokenizer::begin()
{
    // reset() asserts that no script is executing and that we are not on
    // hold, so these have to be cleared before calling it.
    m_executingScript = 0;
    onHold = false;
    loadingExtScript = false;
    reset();

    // Token buffer: 255 QChars are allocated while size is recorded as 254.
    buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
    dest = buffer;
    size = 254;

    // Tag / entity state machines.
    tag = NoTag;
    tquote = NoQuote;
    Entity = NoEntity;
    searchCount = 0;
    startTag = false;

    // White-space handling.
    pending = NonePending;
    discard = NoneDiscard;
    skipLF = false;
    pre = false;
    prePos = 0;

    // Content modes.
    plaintext = false;
    xmp = false;
    script = false;
    style = false;
    textarea = false;
    title = false;
    select = false;
    comment = false;
    server = false;
    processingInstruction = false;
    escaped = false;
    brokenComments = false;
    brokenServer = false;

    // Script sources and deferred input.
    scriptSrc = QString::null;
    pendingSrc.clear();
    currentPrependingSrc = 0;
    noMoreData = false;
    forceSynchronous = false;

    // Line bookkeeping.
    lineno = 0;
    scriptStartLineno = 0;
    tagStartLineno = 0;
}
void HTMLTokenizer::setForceSynchronous(bool force)
{
forceSynchronous = force;
}
// Copies `list` into the token buffer. White space is not copied directly:
// it is recorded in `pending` and emitted through addPending() once the next
// character arrives. A "\r\n" pair counts as one line break. Unless `style`
// is set, `pre` is forced on for the duration of the call and restored at
// the end.
void HTMLTokenizer::processListing(TokenizerString list)
{
    const bool savedPre = pre;
    if (!style)
        pre = true;
    prePos = 0;

    // Every path through the loop body consumes exactly one character.
    for (; !list.isEmpty(); ++list) {
        checkBuffer(3*TAB_SIZE);
        const QChar ch = *list;

        // A '\n' directly after a '\r' belongs to the same line break.
        if (skipLF) {
            skipLF = false;
            if (ch == '\n')
                continue;
        }

        if (ch == '\n' || ch == '\r') {
            if (discard == LFDiscard) {
                discard = NoneDiscard;
            } else {
                if (pending)
                    addPending();
                pending = LFPending;
            }
            if (ch == '\r')
                skipLF = true;
            continue;
        }

        if (ch == ' ' || ch == '\t') {
            if (pending)
                addPending();
            if (ch == ' ')
                pending = SpacePending;
            else
                pending = TabPending;
            continue;
        }

        // Ordinary character.
        discard = NoneDiscard;
        if (pending)
            addPending();
        prePos++;
        *dest++ = ch;
    }

    if (pending)
        addPending();
    prePos = 0;
    pre = savedPre;
}
// Collects the raw content of a <script>, <style>, <textarea>, <title> or
// <xmp> element (exactly one of the corresponding flags must be set) into
// scriptCode, until the end tag stored in searchStopper has been found and
// its closing '>' consumed. May return early when src runs empty; the state
// is kept in members so a later call continues where this one stopped.
void HTMLTokenizer::parseSpecial(TokenizerString &src)
{
assert( textarea || title || !Entity );
assert( !tag );
assert( xmp+textarea+title+style+script == 1 );
// Remember the line the script text starts on; scriptHandler() hands it to
// scriptExecution() as the base line.
if (script)
scriptStartLineno = lineno+src.lineCount();
// Continue a comment that was still open when the previous call returned.
if ( comment ) parseComment( src );
while ( !src.isEmpty() ) {
checkScriptBuffer();
unsigned char ch = src->latin1();
// A '-' that completes "<!--" starts a comment inside the content
// (never in textarea, xmp or title, and not when brokenComments is set).
if ( !scriptCodeResync && !brokenComments && !textarea && !xmp && !title && ch == '-' && scriptCodeSize >= 3 && !src.escaped() && scriptCode[scriptCodeSize-3] == '<' && scriptCode[scriptCodeSize-2] == '!' && scriptCode[scriptCodeSize-1] == '-' ) {
comment = true;
parseComment( src );
continue;
}
// The end tag name was matched earlier (scriptCodeResync != 0) and we now
// reached its unquoted '>': the element is complete.
if ( scriptCodeResync && !tquote && ( ch == '>' ) ) {
++src;
// Cut the end tag itself off the collected text.
scriptCodeSize = scriptCodeResync-1;
scriptCodeResync = 0;
scriptCode[ scriptCodeSize ] = scriptCode[ scriptCodeSize + 1 ] = 0;
if ( script )
scriptHandler();
else {
// Emit the collected text, then the matching close-tag token.
processListing(TokenizerString(scriptCode, scriptCodeSize));
processToken();
if ( style ) { currToken.id = ID_STYLE + ID_CLOSE_TAG; }
else if ( textarea ) { currToken.id = ID_TEXTAREA + ID_CLOSE_TAG; }
else if ( title ) { currToken.id = ID_TITLE + ID_CLOSE_TAG; }
else if ( xmp ) { currToken.id = ID_XMP + ID_CLOSE_TAG; }
processToken();
style = script = style = textarea = title = xmp = false;
tquote = NoQuote;
scriptCodeSize = scriptCodeResync = 0;
}
return;
}
// Look for the end tag: the text collected so far must end with
// searchStopper (compared via tagMatch) and the current character must be
// '>', '/' or a non-NUL character <= ' '. The character is not consumed here.
if ( !scriptCodeResync && !escaped && !src.escaped() && ( ch == '>' || ch == '/' || ch <= ' ' ) && ch &&
scriptCodeSize >= searchStopperLen &&
tagMatch( searchStopper, scriptCode+scriptCodeSize-searchStopperLen, searchStopperLen )) {
scriptCodeResync = scriptCodeSize-searchStopperLen+1;
tquote = NoQuote;
continue;
}
// While scanning the rest of the end tag, keep track of quotes; a line
// break terminates an open quote.
if ( scriptCodeResync && !escaped ) {
if(ch == '\"')
tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
else if(ch == '\'')
tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
tquote = NoQuote;
}
// A backslash escapes the next character unless it is itself escaped.
escaped = ( !escaped && ch == '\\' );
// Entities are decoded only inside textarea and title content.
if (!scriptCodeResync && (textarea||title) && !src.escaped() && ch == '&') {
QChar *scriptCodeDest = scriptCode+scriptCodeSize;
++src;
parseEntity(src,scriptCodeDest,true);
scriptCodeSize = scriptCodeDest-scriptCode;
}
else {
// Plain character: store it (after fixUpChar) and advance.
scriptCode[scriptCodeSize] = *src;
fixUpChar(scriptCode[scriptCodeSize]);
++scriptCodeSize;
++src;
}
}
}
// Called when a script element is complete. Emits the collected script text
// and the closing </script> token, and then either requests the external
// script named by scriptSrc or executes the inline code. Input that has not
// been tokenized yet (the member src) is moved aside first, so that whatever
// the script writes is handled before it.
void HTMLTokenizer::scriptHandler()
{
bool doScriptExec = false;
CachedScript* cs = 0;
// External script: ask the document loader for it and queue the result.
if (!scriptSrc.isEmpty() && parser->doc()->part()) {
if ( !parser->skipMode() ) {
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("Requesting script at time %d\n", parser->doc()->elapsedTime());
#endif
if ( (cs = parser->doc()->docLoader()->requestScript(scriptSrc, scriptSrcCharset) ))
cachedScript.enqueue(cs);
}
scriptSrc=QString::null;
}
else {
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << "---START SCRIPT---" << endl;
kdDebug( 6036 ) << QString(scriptCode, scriptCodeSize) << endl;
kdDebug( 6036 ) << "---END SCRIPT---" << endl;
#endif
// Inline script: candidate for execution below.
doScriptExec = true;
}
// Run the script text through processListing() and keep a copy of the
// result as the code to execute, before processToken() is called.
processListing(TokenizerString(scriptCode, scriptCodeSize));
QString exScript( buffer, dest-buffer );
processToken();
currToken.id = ID_SCRIPT + ID_CLOSE_TAG;
processToken();
// While this call is active, write() appends incoming data to the local
// prependingSrc (through currentPrependingSrc); the previous target is
// restored before returning.
TokenizerString *savedPrependingSrc = currentPrependingSrc;
TokenizerString prependingSrc;
currentPrependingSrc = &prependingSrc;
if ( !parser->skipMode() ) {
if (cs) {
// External script requested: park the remaining input and start
// listening for the script (cs->ref(this)).
if (savedPrependingSrc) {
savedPrependingSrc->append(src);
} else {
pendingSrc.prepend(src);
}
setSrc(TokenizerString());
scriptCodeSize = scriptCodeResync = 0;
cs->ref(this);
if (!cachedScript.isEmpty())
loadingExtScript = true;
}
else if (view && doScriptExec && javascript ) {
// Inline JavaScript: park the remaining input, then execute.
if (!m_executingScript)
pendingSrc.prepend(src);
else
prependingSrc = src;
setSrc(TokenizerString());
scriptCodeSize = scriptCodeResync = 0;
scriptExecution( exScript, QString::null, scriptStartLineno );
}
}
script = false;
scriptCodeSize = scriptCodeResync = 0;
// Nothing is executing or loading any more: resume with the parked input.
if ( !m_executingScript && !loadingExtScript ) {
src.append(pendingSrc);
pendingSrc.clear();
} else if (!prependingSrc.isEmpty()) {
// Data collected in prependingSrc during this call is either queued
// (an external script is still loading) or written right away.
currentPrependingSrc = savedPrependingSrc;
if (loadingExtScript) {
if (currentPrependingSrc) {
currentPrependingSrc->append(prependingSrc);
} else {
pendingSrc.prepend(prependingSrc);
}
} else {
write(prependingSrc, false);
}
}
currentPrependingSrc = savedPrependingSrc;
}
// Executes the script text `str` through the part owning the view.
//   str       - script source to run
//   scriptURL - URL passed to executeScript(); when null, the URL of the
//               part's document is used instead
//   baseLine  - line number passed to executeScript()
// m_executingScript is incremented for the duration of the call, and data
// written meanwhile is collected in a local prependingSrc.
void HTMLTokenizer::scriptExecution( const QString& str, QString scriptURL,
int baseLine)
{
#if APPLE_CHANGES
// Nothing can be executed without a view that has a part.
if (!view || !view->part())
return;
#endif
bool oldscript = script;
m_executingScript++;
script = false;
QString url;
if (scriptURL.isNull())
url = static_cast<DocumentImpl*>(view->part()->document().handle())->URL();
else
url = scriptURL;
// Redirect write() to the local prependingSrc while the script runs; the
// previous target is restored before returning.
TokenizerString *savedPrependingSrc = currentPrependingSrc;
TokenizerString prependingSrc;
currentPrependingSrc = &prependingSrc;
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("beginning script execution at %d\n", parser->doc()->elapsedTime());
#endif
view->part()->executeScript(url,baseLine,Node(),str);
// Lets the next continueProcessing() call consider yielding right away.
allowYield = true;
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("ending script execution at %d\n", parser->doc()->elapsedTime());
#endif
m_executingScript--;
script = oldscript;
// Nothing is executing or loading any more: resume with the parked input.
if ( !m_executingScript && !loadingExtScript ) {
src.append(pendingSrc);
pendingSrc.clear();
} else if (!prependingSrc.isEmpty()) {
// Data collected during execution is either queued (an external
// script is still loading) or written right away.
currentPrependingSrc = savedPrependingSrc;
if (loadingExtScript) {
if (currentPrependingSrc) {
currentPrependingSrc->append(prependingSrc);
} else {
pendingSrc.prepend(prependingSrc);
}
} else {
write(prependingSrc, false);
}
}
currentPrependingSrc = savedPrependingSrc;
}
// Copies comment text from src into scriptCode until the comment ends.
// A '>' ends the comment when it is preceded by "--" or "--!", or - with
// brokenComments set and outside script/style - unconditionally.
// Outside script, xmp, textarea and style the collected text is discarded
// afterwards; if includesCommentsInDOM is set it is first emitted as an
// ID_COMMENT token followed by the matching close token.
void HTMLTokenizer::parseComment(TokenizerString &src)
{
    checkScriptBuffer(src.length());
    for (; !src.isEmpty(); ++src) {
        scriptCode[ scriptCodeSize ] = *src;
        ++scriptCodeSize;
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
        qDebug("comment is now: *%s*",
               QConstString((QChar*)src.current(), QMIN(16, src.length())).string().latin1());
#endif
        // Only a '>' can terminate the comment.
        if (src->unicode() != '>')
            continue;

        // Number of trailing characters that form the terminator: ">" (1),
        // "-->" (3) or "--!>" (4).
        int terminatorLength = 1;
        if (scriptCodeSize > 2 && scriptCode[scriptCodeSize-3] == '-' && scriptCode[scriptCodeSize-2] == '-')
            terminatorLength = 3;
        else if (scriptCodeSize > 3 && scriptCode[scriptCodeSize-4] == '-' && scriptCode[scriptCodeSize-3] == '-'
                 && scriptCode[scriptCodeSize-2] == '!')
            terminatorLength = 4;

        const bool lenientClose = brokenComments && !(script || style);
        if (!lenientClose && terminatorLength <= 1)
            continue;

        // The comment is finished; consume the '>'.
        ++src;
        const bool insideRawContent = script || xmp || textarea || style;
        if (!insideRawContent) {
            if (includesCommentsInDOM) {
                checkScriptBuffer();
                scriptCode[ scriptCodeSize ] = 0;
                scriptCode[ scriptCodeSize + 1 ] = 0;
                currToken.id = ID_COMMENT;
                processListing(TokenizerString(scriptCode, scriptCodeSize - terminatorLength));
                processToken();
                currToken.id = ID_COMMENT + ID_CLOSE_TAG;
                processToken();
            }
            scriptCodeSize = 0;
        }
        comment = false;
        return;
    }
}
// Skips a server-side block: characters are copied into scriptCode until a
// '>' directly preceded by '%' is seen. The collected text is then thrown
// away (scriptCodeSize = 0) and the `server` flag cleared.
void HTMLTokenizer::parseServer(TokenizerString &src)
{
    checkScriptBuffer(src.length());
    for (; !src.isEmpty(); ++src) {
        const bool isClosingAngle = (src->unicode() == '>');
        scriptCode[ scriptCodeSize ] = *src;
        ++scriptCodeSize;

        if (!isClosingAngle)
            continue;
        if (scriptCodeSize <= 1 || scriptCode[scriptCodeSize-2] != '%')
            continue;

        // Found "%>": consume the '>' and leave server mode.
        ++src;
        server = false;
        scriptCodeSize = 0;
        return;
    }
}
// Skips over a processing instruction. Quote characters toggle tquote; the
// instruction ends at a '>' that is either outside quotes or directly
// preceded by '?'. On completion the flag is cleared and discard is set to
// LFDiscard.
void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src)
{
    char previous = 0;
    for (; !src.isEmpty(); ++src) {
        const unsigned char current = src->latin1();
        switch (current) {
        case '\'':
            if (tquote == SingleQuote)
                tquote = NoQuote;
            else
                tquote = SingleQuote;
            break;
        case '\"':
            if (tquote == DoubleQuote)
                tquote = NoQuote;
            else
                tquote = DoubleQuote;
            break;
        case '>':
            if (!tquote || previous == '?') {
                processingInstruction = false;
                ++src;
                discard = LFDiscard;
                return;
            }
            break;
        default:
            break;
        }
        previous = current;
    }
}
// Copies src into the token buffer as plain text. "\r", "\n" and "\r\n" are
// each written as a single '\n'; every other character is copied and passed
// through fixUpChar.
void HTMLTokenizer::parseText(TokenizerString &src)
{
    for (; !src.isEmpty(); ++src) {
        checkBuffer();
        const unsigned char current = src->latin1();

        // A '\n' directly after a '\r' was already accounted for.
        if (skipLF) {
            skipLF = false;
            if (current == '\n')
                continue;
        }

        if (current == '\n' || current == '\r') {
            if (current == '\r')
                skipLF = true;
            *dest++ = '\n';
            continue;
        }

        {
            *dest = *src;
            fixUpChar(*dest);
            ++dest;
        }
    }
}
// Decodes a character reference. The caller has already consumed the '&'.
//   src   - input; consumed as far as the reference extends
//   dest  - output cursor; advanced only when the reference cannot be
//           decoded and is therefore copied out literally
//   start - true to begin a new reference; otherwise decoding continues from
//           the state kept in Entity / cBuffer / EntityUnicodeValue, so a
//           reference split across input chunks can be resumed
// A successfully decoded character is handed back through src.push(); an
// unknown reference is written to dest as '&' followed by the collected text.
void HTMLTokenizer::parseEntity(TokenizerString &src, QChar *&dest, bool start)
{
    if( start )
    {
        cBufferPos = 0;
        Entity = SearchEntity;
        EntityUnicodeValue = 0;
    }
    while( !src.isEmpty() )
    {
        ushort cc = src->unicode();
        switch(Entity) {
        case NoEntity:
            assert(Entity != NoEntity);
            return;
        case SearchEntity:
            // '#' introduces a numeric reference, anything else a named one.
            if(cc == '#') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = NumericSearch;
            }
            else
                Entity = EntityName;
            break;
        case NumericSearch:
            if(cc == 'x' || cc == 'X') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = Hexadecimal;
            }
            else if(cc >= '0' && cc <= '9')
                Entity = Decimal;
            else
                Entity = SearchSemicolon;
            break;
        case Hexadecimal:
        {
            // At most 8 hex digits are read per pass.
            int ll = kMin(src.length(), 8);
            while(ll--) {
                QChar csrc(src->lower());
                cc = csrc.cell();
                if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
                    break;
                }
                EntityUnicodeValue = EntityUnicodeValue*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            Entity = SearchSemicolon;
            break;
        }
        case Decimal:
        {
            // Collect digits until cBuffer holds 9 characters in total.
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                cc = src->cell();
                if(src->row() || !(cc >= '0' && cc <= '9')) {
                    Entity = SearchSemicolon;
                    break;
                }
                EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            if(cBufferPos == 9) Entity = SearchSemicolon;
            break;
        }
        case EntityName:
        {
            // Collect up to 9 ASCII letters/digits, then look the name up.
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                QChar csrc = *src;
                cc = csrc.cell();
                if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
                                   (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                    Entity = SearchSemicolon;
                    break;
                }
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            if(cBufferPos == 9) Entity = SearchSemicolon;
            if(Entity == SearchSemicolon) {
                if(cBufferPos > 1) {
                    const entity *e = findEntity(cBuffer, cBufferPos);
                    if(e)
                        EntityUnicodeValue = e->code;
                    // Inside a tag, entities above 255 are only accepted
                    // when they are terminated by ';'.
                    if(tag && EntityUnicodeValue > 255 && *src != ';')
                        EntityUnicodeValue = 0;
                }
            }
            else
                break;
        }
        // Intentional fall-through: the name is complete, finish the entity.
        case SearchSemicolon:
            // Accept only Unicode scalar values: no surrogate code points and
            // nothing above U+10FFFF. (The surrogate arithmetic below is only
            // valid up to U+10FFFF; larger values would yield a malformed
            // UTF-16 pair, so they are treated as unknown references.)
            if ((EntityUnicodeValue > 0 && EntityUnicodeValue < 0xD800)
                || (EntityUnicodeValue >= 0xE000 && EntityUnicodeValue <= 0x10FFFF)) {
                if (*src == ';')
                    ++src;
                if (EntityUnicodeValue <= 0xFFFF) {
                    QChar c(EntityUnicodeValue);
                    fixUpChar(c);
                    checkBuffer();
                    src.push(c);
                } else {
                    // Encode as a UTF-16 surrogate pair.
                    QChar c1(0xD800 | (((EntityUnicodeValue >> 16) - 1) << 6) | ((EntityUnicodeValue >> 10) & 0x3F));
                    QChar c2(0xDC00 | (EntityUnicodeValue & 0x3FF));
                    checkBuffer(2);
                    src.push(c1);
                    src.push(c2);
                }
            } else {
#ifdef TOKEN_DEBUG
                kdDebug( 6036 ) << "unknown entity!" << endl;
#endif
                // Not decodable: copy '&' and the collected characters verbatim.
                checkBuffer(10);
                *dest++ = '&';
                for(unsigned int i = 0; i < cBufferPos; i++)
                    dest[i] = cBuffer[i];
                dest += cBufferPos;
                if (pre)
                    prePos += cBufferPos+1;
            }
            Entity = NoEntity;
            return;
        }
    }
}
// State machine that tokenizes one tag, driven by the `tag` member:
//   TagName -> SearchAttribute -> AttributeName -> SearchEqual ->
//   SearchValue -> QuotedValue / Value -> ... -> SearchEnd
// All state lives in members, so the function can return when src runs empty
// and be called again with more input. When the tag is complete, the token is
// emitted through processToken() and the modes for special elements (script,
// style, textarea, title, xmp, pre, select, plaintext) are set up.
void HTMLTokenizer::parseTag(TokenizerString &src)
{
assert(!Entity );
while ( !src.isEmpty() )
{
checkBuffer();
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
uint l = 0;
while(l < src.length() && (*(src.current()+l)).latin1() != '>')
l++;
qDebug("src is now: *%s*, tquote: %d",
QConstString((QChar*)src.current(), l).string().latin1(), tquote);
#endif
switch(tag) {
case NoTag:
{
return;
}
// Reading the tag name (and detecting "<!--").
case TagName:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("TagName");
#endif
// searchCount > 0 means we are matching commentStart character by character.
if (searchCount > 0)
{
if (*src == commentStart[searchCount])
{
searchCount++;
if (searchCount == 4)
{
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << "Found comment" << endl;
#endif
++src;
dest = buffer; comment = true;
tag = NoTag;
// In compat mode "<!-->" is treated as an already finished comment.
if (!src.isEmpty() && *src == '>' && parser->doc()->inCompatMode()) {
comment = false;
++src;
if (!src.isEmpty())
cBuffer[cBufferPos++] = src->cell();
}
else
parseComment(src);
return; }
cBuffer[cBufferPos++] = src->cell();
++src;
break;
}
else
searchCount = 0; }
// Collect the lower-cased name into cBuffer until white space or '>'
// is reached, or the buffer is full.
bool finish = false;
unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
while(ll--) {
ushort curchar = *src;
if(curchar <= ' ' || curchar == '>' ) {
finish = true;
break;
}
cBuffer[cBufferPos++] = tolower(curchar);
++src;
}
if(finish || CBUFLEN == cBufferPos) {
bool beginTag;
char* ptr = cBuffer;
unsigned int len = cBufferPos;
cBuffer[cBufferPos] = '\0';
// A leading '/' marks an end tag.
if ((cBufferPos > 0) && (*ptr == '/'))
{
beginTag = false;
ptr++;
len--;
}
else
beginTag = true;
// Strip a trailing '/' from the name.
if(len > 1 && ptr[len-1] == '/' )
ptr[--len] = '\0';
unsigned short tagID = getTagID(ptr, len);
// Name not known to getTagID(): ask the document for an id if the name is valid.
if (!tagID) {
DOMString tagName(ptr);
DocumentImpl *doc = parser->docPtr()->document();
if (doc->isValidName(tagName))
tagID = parser->docPtr()->document()->tagId(0, tagName.implementation(), false);
}
if (tagID) {
#ifdef TOKEN_DEBUG
QCString tmp(ptr, len+1);
kdDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data() << endl;
#endif
currToken.id = beginTag ? tagID : tagID + ID_CLOSE_TAG;
}
dest = buffer;
tag = SearchAttribute;
cBufferPos = 0;
}
break;
}
// Skipping white space and stray quotes before the next attribute.
case SearchAttribute:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("SearchAttribute");
#endif
bool atespace = false;
ushort curchar;
while(!src.isEmpty()) {
curchar = *src;
if (curchar > ' ' && curchar != '\'' && curchar != '"') {
if (curchar == '<' || curchar == '>')
tag = SearchEnd;
else
tag = AttributeName;
cBufferPos = 0;
break;
}
atespace = true;
++src;
}
break;
}
// Reading an attribute name into cBuffer (lower-cased).
case AttributeName:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("AttributeName");
#endif
ushort curchar;
int ll = kMin(src.length(), CBUFLEN-cBufferPos);
while(ll--) {
curchar = *src;
if(curchar <= '>') {
if(curchar <= ' ' || curchar == '=' || curchar == '>') {
unsigned int a;
cBuffer[cBufferPos] = '\0';
a = getAttrID(cBuffer, cBufferPos);
if (a)
attrNamePresent = true;
else {
// Unknown attribute id: keep the name as a string instead.
attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
attrNamePresent = !attrName.isEmpty();
// "<script ... />": mark the token as flat.
if (currToken.id == ID_SCRIPT && curchar == '>' &&
attrName == "/")
currToken.flat = true;
}
// The attribute id is stored in buffer[0]; the value follows from buffer+1.
dest = buffer;
*dest++ = a;
#ifdef TOKEN_DEBUG
if (!a || (cBufferPos && *cBuffer == '!'))
kdDebug( 6036 ) << "Unknown attribute: *" << QCString(cBuffer, cBufferPos+1).data() << "*" << endl;
else
kdDebug( 6036 ) << "Known attribute: " << QCString(cBuffer, cBufferPos+1).data() << endl;
#endif
tag = SearchEqual;
break;
}
}
cBuffer[cBufferPos++] = tolower(curchar);
++src;
}
// Name filled the whole buffer: treat what we have as the (unknown) name.
if ( cBufferPos == CBUFLEN ) {
cBuffer[cBufferPos] = '\0';
attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data());
attrNamePresent = !attrName.isEmpty();
dest = buffer;
*dest++ = 0;
tag = SearchEqual;
}
break;
}
// Looking for '=' after the attribute name.
case SearchEqual:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("SearchEqual");
#endif
ushort curchar;
bool atespace = false;
while(!src.isEmpty()) {
curchar = src->unicode();
if (curchar > ' ' && curchar != '\'' && curchar != '"') {
if(curchar == '=') {
#ifdef TOKEN_DEBUG
kdDebug(6036) << "found equal" << endl;
#endif
tag = SearchValue;
++src;
}
else {
// No '=': the attribute has no value; add it with emptyAtom.
currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, emptyAtom);
dest = buffer;
tag = SearchAttribute;
}
break;
}
atespace = true;
++src;
}
break;
}
// Skipping white space before the value and detecting an opening quote.
case SearchValue:
{
ushort curchar;
while(!src.isEmpty()) {
curchar = src->unicode();
if(curchar > ' ') {
if(( curchar == '\'' || curchar == '\"' )) {
tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
tag = QuotedValue;
++src;
} else
tag = Value;
break;
}
++src;
}
break;
}
// Reading a quoted attribute value into buffer+1.
case QuotedValue:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("QuotedValue");
#endif
ushort curchar;
while(!src.isEmpty()) {
checkBuffer();
curchar = src->unicode();
// A '>' while no attribute name is present ends the value; the text
// read so far is used both as name and as value. The '>' itself is
// not consumed here.
if (curchar == '>' && !attrNamePresent) {
while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
dest--; AtomicString v(buffer+1, dest-buffer-1);
attrName.setUnicode(buffer+1,dest-buffer-1);
currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
tag = SearchAttribute;
dest = buffer;
tquote = NoQuote;
break;
}
if(curchar <= '\'' && !src.escaped()) {
if ( curchar == '&' )
{
++src;
parseEntity(src, dest, true);
break;
}
else if ( (tquote == SingleQuote && curchar == '\'') ||
(tquote == DoubleQuote && curchar == '\"') )
{
// Closing quote: drop trailing line breaks and add the attribute.
while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r'))
dest--; AtomicString v(buffer+1, dest-buffer-1);
if (!attrNamePresent)
attrName.setUnicode(buffer+1,dest-buffer-1);
currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
dest = buffer;
tag = SearchAttribute;
tquote = NoQuote;
++src;
break;
}
}
*dest = *src;
fixUpChar(*dest);
++dest;
++src;
}
break;
}
// Reading an unquoted attribute value; it ends at white space or '>'.
case Value:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("Value");
#endif
ushort curchar;
while(!src.isEmpty()) {
checkBuffer();
curchar = src->unicode();
if(curchar <= '>' && !src.escaped()) {
if ( curchar == '&' )
{
++src;
parseEntity(src, dest, true);
break;
}
if ( curchar <= ' ' || curchar == '>' )
{
AtomicString v(buffer+1, dest-buffer-1);
currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v);
dest = buffer;
tag = SearchAttribute;
break;
}
}
*dest = *src;
fixUpChar(*dest);
++dest;
++src;
}
break;
}
// Looking for the end of the tag ('>' or the '<' of a following tag),
// then emitting the token.
case SearchEnd:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("SearchEnd");
#endif
while(!src.isEmpty()) {
if (*src == '>' || *src == '<')
break;
if (*src == '/')
currToken.flat = true;
++src;
}
if (src.isEmpty()) break;
searchCount = 0; tag = NoTag;
tquote = NoQuote;
// A '<' is left in src for the caller to process.
if (*src != '<')
++src;
// Unknown tag (no id was assigned): nothing to emit.
if ( !currToken.id ) return;
uint tagID = currToken.id;
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0
kdDebug( 6036 ) << "appending Tag: " << tagID << endl;
#endif
bool beginTag = !currToken.flat && (tagID <= ID_CLOSE_TAG);
if (tagID > ID_CLOSE_TAG)
tagID -= ID_CLOSE_TAG;
else if (tagID == ID_SCRIPT) {
// <script> start tag: pick up src/charset and decide, from the type
// or language attribute, whether the content is JavaScript.
AttributeImpl* a = 0;
bool foundTypeAttribute = false;
scriptSrc = QString::null;
scriptSrcCharset = QString::null;
if ( currToken.attrs &&
parser->doc()->part() &&
parser->doc()->part()->jScriptEnabled() &&
view
) {
if ( ( a = currToken.attrs->getAttributeItem( ATTR_SRC ) ) )
scriptSrc = parser->doc()->completeURL(parseURL( a->value() ).string() );
if ( ( a = currToken.attrs->getAttributeItem( ATTR_CHARSET ) ) )
scriptSrcCharset = a->value().string().stripWhiteSpace();
if ( scriptSrcCharset.isEmpty() )
scriptSrcCharset = parser->doc()->part()->encoding();
// A non-empty type attribute takes precedence over language.
if ((a = currToken.attrs->getAttributeItem(ATTR_TYPE)) != 0 && !a->value().string().isEmpty())
foundTypeAttribute = true;
else
a = currToken.attrs->getAttributeItem(ATTR_LANGUAGE);
}
javascript = true;
if( foundTypeAttribute ) {
QString type = a->value().string().stripWhiteSpace().lower();
if( type.compare("application/x-javascript") != 0 &&
type.compare("text/javascript") != 0 &&
type.compare("text/javascript1.0") != 0 &&
type.compare("text/javascript1.1") != 0 &&
type.compare("text/javascript1.2") != 0 &&
type.compare("text/javascript1.3") != 0 &&
type.compare("text/javascript1.4") != 0 &&
type.compare("text/javascript1.5") != 0 &&
type.compare("text/jscript") != 0 &&
type.compare("text/ecmascript") != 0 &&
type.compare("text/livescript") )
javascript = false;
} else if( a ) {
QString lang = a->value().string();
lang = lang.lower();
if( lang.compare("") != 0 &&
lang.compare("javascript") != 0 &&
lang.compare("javascript1.0") != 0 &&
lang.compare("javascript1.1") != 0 &&
lang.compare("javascript1.2") != 0 &&
lang.compare("javascript1.3") != 0 &&
lang.compare("javascript1.4") != 0 &&
lang.compare("javascript1.5") != 0 &&
lang.compare("ecmascript") != 0 &&
lang.compare("livescript") != 0 &&
lang.compare("jscript") )
javascript = false;
}
}
processToken();
// A start tag that is not allowed as a child of <pre> ends pre mode.
if(pre && beginTag && !DOM::checkChild(ID_PRE, tagID)) {
kdDebug(6036) << " not allowed in <pre> " << (int)tagID << endl;
pre = false;
}
// Switch into the content mode required by the element just opened.
switch( tagID ) {
case ID_PRE:
prePos = 0;
pre = beginTag;
break;
case ID_SCRIPT:
if (beginTag) {
searchStopper = scriptEnd;
searchStopperLen = 8;
script = true;
parseSpecial(src);
}
// Reached for a flat script start tag (currToken.flat set, id without ID_CLOSE_TAG).
else if (tagID <= ID_CLOSE_TAG) scriptHandler();
break;
case ID_STYLE:
if (beginTag) {
searchStopper = styleEnd;
searchStopperLen = 7;
style = true;
parseSpecial(src);
}
break;
case ID_TEXTAREA:
if(beginTag) {
searchStopper = textareaEnd;
searchStopperLen = 10;
textarea = true;
parseSpecial(src);
}
break;
case ID_TITLE:
if (beginTag) {
searchStopper = titleEnd;
searchStopperLen = 7;
title = true;
parseSpecial(src);
}
break;
case ID_XMP:
if (beginTag) {
searchStopper = xmpEnd;
searchStopperLen = 5;
xmp = true;
parseSpecial(src);
}
break;
case ID_SELECT:
select = beginTag;
break;
case ID_PLAINTEXT:
plaintext = beginTag;
break;
}
if (beginTag && endTagRequirement(tagID) == FORBIDDEN)
discard = NoneDiscard;
return; }
} }
return;
}
// Writes the white space recorded in `pending` to the token buffer and
// clears `pending`.
//  - Inside <select> (and not in a script) any pending white space becomes
//    a single space.
//  - Otherwise a line feed resets prePos, a space advances it by one, and a
//    tab advances it to the next multiple of TAB_SIZE. The tab is written
//    literally inside textarea/script content and as spaces elsewhere.
// Calling this with pending == NonePending is a programming error (asserted).
void HTMLTokenizer::addPending()
{
    if ( select && !script ) {
        *dest++ = ' ';
    } else {
        const bool keepLiteralTab = textarea || script;
        switch (pending) {
        case LFPending:
            *dest++ = '\n';
            prePos = 0;
            break;
        case SpacePending:
            *dest++ = ' ';
            ++prePos;
            break;
        case TabPending:
            if (keepLiteralTab) {
                *dest++ = '\t';
                prePos += TAB_SIZE - (prePos % TAB_SIZE);
            } else {
                const int fill = TAB_SIZE - ( prePos % TAB_SIZE );
#ifdef TOKEN_DEBUG
                qDebug("tab pending, prePos: %d, toadd: %d", prePos, fill);
#endif
                for ( int written = 0; written < fill; written++ )
                    *dest++ = QChar(' ');
                prePos += fill;
            }
            break;
        case NonePending:
            assert(0);
            break;
        }
    }
    pending = NonePending;
}
// Main entry point: feeds `str` to the tokenizer.
//   str        - new input
//   appendData - when true and a script is currently executing, the data is
//                queued instead of being tokenized immediately
// Input is also queued while cached scripts are outstanding, while the
// tokenizer is on hold, or while a continuation timer is pending. Otherwise
// the loop below dispatches to the parse* helper matching the current mode
// until src is empty, a location change is scheduled, or
// continueProcessing() asks to stop.
void HTMLTokenizer::write(const TokenizerString &str, bool appendData)
{
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << this << " Tokenizer::write(\"" << str << "\"," << appendData << ")" << endl;
#endif
// No buffer: begin() has not run, or end() already released it.
if (!buffer)
return;
if (loadStopped)
return;
// A script is running or still being loaded: queue the data.
if ( ( m_executingScript && appendData ) || !cachedScript.isEmpty() ) {
if (currentPrependingSrc) {
currentPrependingSrc->append(str);
} else {
pendingSrc.append(str);
}
return;
}
if ( onHold ) {
src.append(str);
return;
}
if (!src.isEmpty())
src.append(str);
else
setSrc(str);
// A timer is already scheduled; timerEvent() will continue with src.
if (timerId)
return;
bool wasInWrite = inWrite;
inWrite = true;
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("Beginning write at time %d\n", parser->doc()->elapsedTime());
#endif
int processedCount = 0;
QTime startTime;
startTime.start();
KWQUIEventTime eventTime;
while (!src.isEmpty() && (!parser->doc()->part() || !parser->doc()->part()->isScheduledLocationChangePending())) {
if (!continueProcessing(processedCount, startTime, eventTime))
break;
checkBuffer();
ushort cc = src->unicode();
// Swallow the '\n' of a "\r\n" pair.
if (skipLF && (cc != '\n'))
skipLF = false;
if (skipLF) {
skipLF = false;
++src;
}
// Mode dispatch: an unfinished construct continues in its own parser.
else if ( Entity )
parseEntity( src, dest );
else if ( plaintext )
parseText( src );
else if (script)
parseSpecial(src);
else if (style)
parseSpecial(src);
else if (xmp)
parseSpecial(src);
else if (textarea)
parseSpecial(src);
else if (title)
parseSpecial(src);
else if (comment)
parseComment(src);
else if (server)
parseServer(src);
else if (processingInstruction)
parseProcessingInstruction(src);
else if (tag)
parseTag(src);
// The previous character was '<': decide what kind of construct starts here.
else if ( startTag )
{
startTag = false;
switch(cc) {
case '/':
break;
case '!':
{
// Possible comment; parseTag() continues matching "<!--".
searchCount = 1;
break;
}
case '?':
{
processingInstruction = true;
tquote = NoQuote;
parseProcessingInstruction(src);
continue;
break;
}
case '%':
if (!brokenServer) {
server = true;
tquote = NoQuote;
parseServer(src);
continue;
}
// With brokenServer set, execution falls through to the default case.
default:
{
if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z')))
{
}
else
{
// Not a tag after all: emit the '<' as text and re-examine cc.
if (pending)
addPending();
*dest = '<';
dest++;
continue;
}
}
};
// Flush or drop pending white space before the tag starts.
if ( pending ) {
if ( pre || script || (!parser->selectMode() &&
(!parser->noSpaces() || dest > buffer ))) {
addPending();
discard = AllDiscard; }
else
pending = NonePending;
}
if (cc == '/' && discard == AllDiscard)
discard = NoneDiscard;
// Emit the text collected so far, then start parsing the tag.
processToken();
cBufferPos = 0;
tag = TagName;
parseTag(src);
}
else if ( cc == '&' && !src.escaped())
{
++src;
if ( pending )
addPending();
parseEntity(src, dest, true);
}
else if ( cc == '<' && !src.escaped())
{
tagStartLineno = lineno+src.lineCount();
++src;
startTag = true;
}
// Line breaks: recorded as pending white space, subject to `discard`.
else if (( cc == '\n' ) || ( cc == '\r' ))
{
if (select && !script)
{
if (discard == LFDiscard)
{
discard = NoneDiscard; }
else if(discard == AllDiscard)
{
}
else
{
if (pending == NonePending)
pending = LFPending;
}
}
else {
if (discard == LFDiscard || discard == AllDiscard)
{
discard = NoneDiscard; }
else
{
if (pending)
addPending();
pending = LFPending;
}
}
if (cc == '\r')
{
skipLF = true;
}
++src;
}
// Spaces and tabs: recorded as pending white space.
else if (( cc == ' ' ) || ( cc == '\t' ))
{
if (select && !script) {
if(discard == SpaceDiscard)
discard = NoneDiscard;
else if(discard == AllDiscard)
{ }
else
pending = SpacePending;
}
else {
if (discard == AllDiscard)
discard = NoneDiscard;
if (pending)
addPending();
if (cc == ' ')
pending = SpacePending;
else
pending = TabPending;
}
++src;
}
// Ordinary text character.
else
{
if (pending)
addPending();
discard = NoneDiscard;
if ( pre )
{
prePos++;
}
#if QT_VERSION < 300
unsigned char row = src->row();
if ( row > 0x05 && row < 0x10 || row > 0xfd )
currToken.complexText = true;
#endif
*dest = *src;
fixUpChar( *dest );
++dest;
++src;
}
}
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("Ending write at time %d\n", parser->doc()->elapsedTime());
#endif
inWrite = wasInWrite;
// All data has been delivered and nothing is outstanding: finish parsing.
if (noMoreData && !inWrite && !loadingExtScript && !m_executingScript && !timerId)
end(); }
// Cancels the continuation timer, if one is running.
void HTMLTokenizer::stopped()
{
    if (!timerId)
        return;
    killTimer(timerId);
    timerId = 0;
}
// Returns true while a continuation timer is scheduled, i.e. while
// tokenizing has been deferred to timerEvent().
bool HTMLTokenizer::processingData() const
{
    const bool timerScheduled = (timerId != 0);
    return timerScheduled;
}
// Decides whether the caller may keep consuming input or has to yield.
//
// processedCount - running counter owned by the caller. It is reset to 0 each
//                  time the elapsed-time check is performed and incremented
//                  whenever this function returns true.
// startTime      - its elapsed() value is compared against
//                  TOKENIZER_TIME_DELAY.
// eventTime      - only consulted for diagnostics when
//                  INSTRUMENT_LAYOUT_SCHEDULING is defined.
//
// Returns true to continue. Returns false after making sure a zero-interval
// timer is scheduled (timerId != 0), so that processing is resumed from
// timerEvent().
bool HTMLTokenizer::continueProcessing(int& processedCount, const QTime& startTime, const KWQUIEventTime& eventTime)
{
    // The yield request is one-shot: consume it regardless of the outcome.
    bool allowedYield = allowYield;
    allowYield = false;

    // The elapsed-time check only runs once more than TOKENIZER_CHUNK_SIZE
    // calls have been counted, or when a yield was explicitly allowed, and
    // never while an external script is loading, a script is executing, or
    // synchronous processing is forced.
    if (!loadingExtScript && !forceSynchronous && !m_executingScript && (processedCount > TOKENIZER_CHUNK_SIZE || allowedYield)) {
        processedCount = 0;
        if (startTime.elapsed() > TOKENIZER_TIME_DELAY) {
            // Schedule a zero-interval timer unless one is already pending.
            if (!timerId)
                timerId = startTimer(0);
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
            if (eventTime.uiEventPending())
                printf("Deferring processing of data because of UI event.\n");
            else if (startTime.elapsed() > TOKENIZER_TIME_DELAY)
                // Print the real threshold instead of a hard-coded number so
                // the message stays in sync with TOKENIZER_TIME_DELAY.
                printf("Deferring processing of data because %dms elapsed away from event loop.\n", TOKENIZER_TIME_DELAY);
#endif
            return false;
        }
    }

    processedCount++;
    return true;
}
// Timer callback. Events for timers other than the tokenizer's own are
// ignored. For our timer, the timer is stopped and a write of an empty
// TokenizerString is issued with noMoreData temporarily cleared; afterwards
// allDataProcessed() is called.
void HTMLTokenizer::timerEvent(QTimerEvent* e)
{
    if (e->timerId() != timerId)
        return;

    killTimer(timerId);
    timerId = 0;

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!parser->doc()->ownerElement())
        printf("Beginning timer write at time %d\n", parser->doc()->elapsedTime());
#endif

    // If the view has a layout pending and the document reports no minimum
    // layout delay, just re-arm the timer and try again on the next tick.
    if (parser->doc()->view() && parser->doc()->view()->layoutPending() && !parser->doc()->minimumLayoutDelay()) {
        timerId = startTimer(0);
        return;
    }

    // Hide the noMoreData flag for the duration of the write and restore it
    // afterwards.
    const bool savedNoMoreData = noMoreData;
    noMoreData = false;
    write(TokenizerString(), true);
    noMoreData = savedNoMoreData;

    allDataProcessed();
}
// Finishes tokenizing once all input has been delivered and nothing is still
// in flight, then tells the view's part (if both still exist) that the
// tokenizer has processed its data.
void HTMLTokenizer::allDataProcessed()
{
    const bool stillBusy = inWrite || loadingExtScript || m_executingScript || onHold || timerId;
    if (!noMoreData || stillBusy)
        return;

    // Take a guarded pointer to the view before calling end(); it is
    // re-checked afterwards before being dereferenced.
    QGuardedPtr<KHTMLView> savedView = view;
    end();

    KHTMLPart* part = savedView ? savedView->part() : 0;
    if (part)
        part->tokenizerProcessedData();
}
// Completes tokenizing: flushes a pending token, releases the text and script
// buffers, and notifies the parser and any listeners that parsing is done.
// Callers are expected to have no timer pending (asserted below).
void HTMLTokenizer::end()
{
    assert(timerId == 0);
    // When assert() is compiled out, a timer that is still pending is
    // cancelled here instead.
    if (timerId) {
        killTimer(timerId);
        timerId = 0;
    }

    // With no buffer allocated there is nothing to flush or free; only the
    // notifications at the bottom run.
    if (buffer != 0) {
        // Flush whatever has been accumulated, unless `tag` is set.
        if (!tag)
            processToken();

        if (buffer)
            KHTML_DELETE_QCHAR_VEC(buffer);
        if (scriptCode)
            KHTML_DELETE_QCHAR_VEC(scriptCode);

        scriptCode = 0;
        scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
        buffer = 0;
    }

    parser->finished();
    emit finishedParsing();
}
void HTMLTokenizer::finish()
{
while((comment || server) && scriptCode && scriptCodeSize)
{
if (comment)
brokenComments = true;
else
brokenServer = true;
checkScriptBuffer();
scriptCode[ scriptCodeSize ] = 0;
scriptCode[ scriptCodeSize + 1 ] = 0;
int pos;
QString food;
if (script || style) {
food.setUnicode(scriptCode, scriptCodeSize);
}
else if (server) {
food = "<";
food += QString(scriptCode, scriptCodeSize);
}
else {
pos = QConstString(scriptCode, scriptCodeSize).string().find('>');
food.setUnicode(scriptCode+pos+1, scriptCodeSize-pos-1); }
KHTML_DELETE_QCHAR_VEC(scriptCode);
scriptCode = 0;
scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
comment = server = false;
if ( !food.isEmpty() )
write(food, true);
}
noMoreData = true;
if (!inWrite && !loadingExtScript && !m_executingScript && !onHold && !timerId)
end(); }
// Hands the current token to the parser. Any characters accumulated in
// [buffer, dest) become the token's text. The token is reset afterwards and
// dest is rewound to the start of the buffer. If there is neither text nor a
// token id, the token is reset and nothing is passed to the parser.
void HTMLTokenizer::processToken()
{
    KJSProxy* jsProxy = 0;
    if (view && view->part())
        jsProxy = view->part()->jScript();

    // Report the line the tag started on while the token is being processed.
    if (jsProxy)
        jsProxy->setEventHandlerLineno(tagStartLineno);

    const bool haveText = dest > buffer;

    if (!haveText && !currToken.id) {
        // Nothing to emit.
        currToken.reset();
        if (jsProxy)
            jsProxy->setEventHandlerLineno(lineno+src.lineCount());
        return;
    }

    if (haveText) {
#ifdef TOKEN_DEBUG
        if (currToken.id) {
            qDebug( "unexpected token id: %d, str: *%s*", currToken.id,QConstString( buffer,dest-buffer ).string().latin1() );
            assert(0);
        }
#endif
        currToken.text = new DOMStringImpl(buffer, dest - buffer);
        currToken.text->ref();
        // Everything except a comment token is turned into a text token.
        if (currToken.id != ID_COMMENT)
            currToken.id = ID_TEXT;
    }

    // Rewind the output cursor for the next token.
    dest = buffer;

#ifdef TOKEN_DEBUG
    QString tokenName = getTagName(currToken.id).string();
    QString tokenText;
    if (currToken.text)
        tokenText = QConstString(currToken.text->s, currToken.text->l).string();

    kdDebug( 6036 ) << "Token --> " << tokenName << " id = " << currToken.id << endl;
    if (currToken.flat)
        kdDebug( 6036 ) << "Token is FLAT!" << endl;
    if (!tokenText.isNull())
        kdDebug( 6036 ) << "text: \"" << tokenText << "\"" << endl;

    unsigned long attrCount = currToken.attrs ? currToken.attrs->length() : 0;
    if (attrCount) {
        kdDebug( 6036 ) << "Attributes: " << attrCount << endl;
        for (unsigned long idx = 0; idx < attrCount; ++idx) {
            AttributeImpl* attr = currToken.attrs->attributeItem(idx);
            kdDebug( 6036 ) << " " << attr->id() << " " << parser->doc()->getDocument()->attrName(attr->id()).string()
                            << "=\"" << attr->value().string() << "\"" << endl;
        }
    }
    kdDebug( 6036 ) << endl;
#endif

    // Once loading has been stopped, tokens are dropped instead of parsed.
    if (!loadStopped) {
        parser->parseToken(&currToken);
    }
    currToken.reset();

    if (jsProxy)
        jsProxy->setEventHandlerLineno(0);
}
// Destroys the tokenizer. It must not be destroyed from inside write()
// (asserted). State is cleared through reset() and the parser object held in
// `parser` is deleted.
HTMLTokenizer::~HTMLTokenizer()
{
    assert(!inWrite);

    reset();
    delete parser;
}
// Grows the text buffer so that it has room for at least `len` more QChars.
// The new capacity is the larger of double the current size and size + len.
// `dest` is re-based onto the reallocated storage at its previous offset.
void HTMLTokenizer::enlargeBuffer(int len)
{
    const int grownSize = kMax(size*2, size+len);
    // Remember the write position as an offset: realloc() may move the block.
    const int writeOffset = (dest - buffer);

    buffer = (QChar*)realloc(buffer, grownSize*sizeof(QChar));
    dest = buffer + writeOffset;
    size = grownSize;
}
// Grows the script buffer so that it has room for at least `len` more QChars.
// The new capacity is the larger of double the current capacity and the
// current capacity + len.
void HTMLTokenizer::enlargeScriptBuffer(int len)
{
    const int grownCapacity = kMax(scriptCodeMaxSize*2, scriptCodeMaxSize+len);

    scriptCode = (QChar*)realloc(scriptCode, grownCapacity*sizeof(QChar));
    scriptCodeMaxSize = grownCapacity;
}
// Notification that a cached object has finished loading (the argument itself
// is not used). Executes, in queue order, every entry at the front of
// cachedScript whose isLoaded() is true, and after each one resumes tokenizing
// the input saved in pendingSrc unless `script` is set.
void HTMLTokenizer::notifyFinished(CachedObject *)
{
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("script loaded at %d\n", parser->doc()->elapsedTime());
#endif
// Precondition: at least one script must be queued.
assert(!cachedScript.isEmpty());
bool finished = false;
// Stop as soon as the queue is drained or its head has not finished loading.
while (!finished && cachedScript.head()->isLoaded()) {
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << "Finished loading an external script" << endl;
#endif
CachedScript* cs = cachedScript.dequeue();
DOMString scriptSource = cs->script();
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << "External script is:" << endl << scriptSource.string() << endl;
#endif
// Swap in an empty source; setSrc() also adds the old source's line count
// to lineno.
setSrc(TokenizerString());
// Copy the URL and call deref(this) before running the script: cs is not
// touched again after this point.
QString cachedScriptUrl( cs->url().string() );
cs->deref(this);
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("external script beginning execution at %d\n", parser->doc()->elapsedTime());
#endif
scriptExecution( scriptSource.string(), cachedScriptUrl );
// The queue is re-examined only after scriptExecution() returns.
// NOTE(review): presumably because executing a script can queue further
// scripts - confirm.
finished = cachedScript.isEmpty();
if (finished) {
loadingExtScript = false;
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
if (!parser->doc()->ownerElement())
printf("external script finished execution at %d\n", parser->doc()->elapsedTime());
#endif
}
// NOTE(review): `script` being set presumably means this was called
// synchronously while a script element is being parsed, in which case the
// caller deals with the remaining input itself - confirm.
if ( !script ) {
// Move the pending input into a local and clear the member before
// re-entering write().
TokenizerString rest = pendingSrc;
pendingSrc.clear();
write(rest, false);
// NOTE(review): the loop condition reads cachedScript again after write();
// confirm that write() cannot destroy this tokenizer.
}
}
}
// Returns the loadingExtScript flag, i.e. whether the tokenizer is currently
// marked as loading an external script.
bool HTMLTokenizer::isWaitingForScripts() const
{
    return loadingExtScript;
}
// Replaces the current input with `source`. The line count of the outgoing
// input is first added to lineno, and the line count of the new input is
// reset.
void HTMLTokenizer::setSrc(const TokenizerString &source)
{
    // Bank the lines counted so far before the old source is overwritten.
    lineno += src.lineCount();

    src = source;
    src.resetLineCount();
}
void HTMLTokenizer::setOnHold(bool _onHold)
{
onHold = _onHold;
}
}