khtml_text_operations.cpp [plain text]
#include "khtml_text_operations.h"
#include <misc/htmltags.h>
#include <rendering/render_text.h>
#include <xml/dom_nodeimpl.h>
using DOM::DOMString;
using DOM::Node;
using DOM::NodeImpl;
using DOM::Range;
namespace khtml {
const unsigned short nonBreakingSpace = 0xA0;
class TextIterator
{
public:
TextIterator();
explicit TextIterator(const DOM::Range &);
bool atEnd() const { return !m_positionNode; }
void advance();
long textLength() const { return m_textLength; }
const QChar *textCharacters() const { return m_textCharacters; }
DOM::Range position() const;
private:
void exitNode();
bool handleTextNode();
bool handleReplacedElement();
bool handleNonTextNode();
void handleTextBox();
void emitCharacter(QChar, DOM::NodeImpl *textNode, long textStartOffset, long textEndOffset);
DOM::NodeImpl *m_node;
long m_offset;
bool m_handledNode;
bool m_handledChildren;
DOM::NodeImpl *m_endNode;
long m_endOffset;
DOM::NodeImpl *m_positionNode;
long m_positionStartOffset;
long m_positionEndOffset;
const QChar *m_textCharacters;
long m_textLength;
bool m_needAnotherNewline;
InlineTextBox *m_textBox;
DOM::NodeImpl *m_lastTextNode;
bool m_lastTextNodeEndedWithCollapsedSpace;
QChar m_lastCharacter;
QChar m_singleCharacterBuffer;
};
class CharacterIterator {
public:
CharacterIterator();
explicit CharacterIterator(const DOM::Range &r);
void advance(long numCharacters);
bool atBreak() const { return m_atBreak; }
bool atEnd() const { return m_textIterator.atEnd(); }
long numCharacters() const { return m_textIterator.textLength() - m_runOffset; }
const QChar *characters() const { return m_textIterator.textCharacters() + m_runOffset; }
long characterOffset() const { return m_offset; }
Range position() const;
private:
long m_offset;
long m_runOffset;
bool m_atBreak;
TextIterator m_textIterator;
};
class CircularSearchBuffer {
public:
CircularSearchBuffer(const QString &target, bool isCaseSensitive);
~CircularSearchBuffer() { free(m_buffer); }
void clear() { m_cursor = m_buffer; m_bufferFull = false; }
void append(long length, const QChar *characters);
void append(const QChar &);
long neededCharacters() const;
bool isMatch() const;
long length() const { return m_target.length(); }
private:
QString m_target;
bool m_isCaseSensitive;
QChar *m_buffer;
QChar *m_cursor;
bool m_bufferFull;
CircularSearchBuffer(const CircularSearchBuffer&);
CircularSearchBuffer &operator=(const CircularSearchBuffer&);
};
TextIterator::TextIterator() : m_positionNode(0)
{
}
inline bool offsetInCharacters(unsigned short type)
{
switch (type) {
case Node::ATTRIBUTE_NODE:
case Node::DOCUMENT_FRAGMENT_NODE:
case Node::DOCUMENT_NODE:
case Node::ELEMENT_NODE:
case Node::ENTITY_REFERENCE_NODE:
return false;
case Node::CDATA_SECTION_NODE:
case Node::COMMENT_NODE:
case Node::PROCESSING_INSTRUCTION_NODE:
case Node::TEXT_NODE:
return true;
case Node::DOCUMENT_TYPE_NODE:
case Node::ENTITY_NODE:
case Node::NOTATION_NODE:
assert(false); return false;
}
assert(false); return false;
}
TextIterator::TextIterator(const Range &r)
{
if (r.isNull()) {
m_positionNode = 0;
return;
}
NodeImpl *startNode = r.startContainer().handle();
NodeImpl *endNode = r.endContainer().handle();
long startOffset = r.startOffset();
long endOffset = r.endOffset();
if (!offsetInCharacters(startNode->nodeType())) {
if (startOffset >= 0 && startOffset < static_cast<long>(startNode->childNodeCount())) {
startNode = startNode->childNode(startOffset);
startOffset = 0;
}
}
if (!offsetInCharacters(endNode->nodeType())) {
if (endOffset > 0 && endOffset <= static_cast<long>(endNode->childNodeCount())) {
endNode = endNode->childNode(endOffset - 1);
endOffset = LONG_MAX;
}
}
m_node = startNode;
m_offset = startOffset;
m_handledNode = false;
m_handledChildren = false;
m_endNode = endNode;
m_endOffset = endOffset;
m_needAnotherNewline = false;
m_textBox = 0;
m_lastTextNode = 0;
m_lastTextNodeEndedWithCollapsedSpace = false;
m_lastCharacter = '\n';
#ifndef NDEBUG
m_positionNode = startNode;
#endif
advance();
}
void TextIterator::advance()
{
assert(m_positionNode);
m_positionNode = 0;
m_textLength = 0;
if (m_needAnotherNewline) {
long offset = m_node->nodeIndex();
emitCharacter('\n', m_node->parentNode(), offset + 1, offset + 1);
m_needAnotherNewline = false;
return;
}
if (m_textBox) {
handleTextBox();
if (m_positionNode) {
return;
}
}
while (m_node) {
if (!m_handledNode) {
RenderObject *renderer = m_node->renderer();
if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
if (renderer->style()->visibility() == VISIBLE) {
m_handledNode = handleTextNode();
}
} else if (renderer && (renderer->isImage() || renderer->isWidget())) {
if (renderer->style()->visibility() == VISIBLE) {
m_handledNode = handleReplacedElement();
}
} else {
m_handledNode = handleNonTextNode();
}
if (m_positionNode) {
return;
}
}
NodeImpl *next = m_handledChildren ? 0 : m_node->firstChild();
m_offset = 0;
if (!next && m_node != m_endNode) {
next = m_node->nextSibling();
while (!next && m_node->parentNode()) {
m_node = m_node->parentNode();
exitNode();
if (m_positionNode) {
m_handledNode = true;
m_handledChildren = true;
return;
}
if (m_node == m_endNode) {
break;
}
next = m_node->nextSibling();
}
}
m_node = next;
m_handledNode = false;
m_handledChildren = false;
if (m_positionNode) {
return;
}
}
}
bool TextIterator::handleTextNode()
{
m_lastTextNode = m_node;
RenderText *renderer = static_cast<RenderText *>(m_node->renderer());
DOMString str = m_node->nodeValue();
if (renderer->style()->whiteSpace() == khtml::PRE) {
long runStart = m_offset;
if (m_lastTextNodeEndedWithCollapsedSpace) {
emitCharacter(' ', m_node, runStart, runStart);
return false;
}
long strLength = str.length();
long end = (m_node == m_endNode) ? m_endOffset : LONG_MAX;
long runEnd = kMin(strLength, end);
m_positionNode = m_node;
m_positionStartOffset = runStart;
m_positionEndOffset = runEnd;
m_textCharacters = str.unicode() + runStart;
m_textLength = runEnd - runStart;
m_lastCharacter = str[runEnd - 1];
return true;
}
if (!renderer->firstTextBox() && str.length() > 0) {
m_lastTextNodeEndedWithCollapsedSpace = true; return true;
}
m_textBox = renderer->firstTextBox();
handleTextBox();
return true;
}
void TextIterator::handleTextBox()
{
RenderText *renderer = static_cast<RenderText *>(m_node->renderer());
DOMString str = m_node->nodeValue();
long start = m_offset;
long end = (m_node == m_endNode) ? m_endOffset : LONG_MAX;
for (; m_textBox; m_textBox = m_textBox->nextTextBox()) {
long textBoxStart = m_textBox->m_start;
long runStart = kMax(textBoxStart, start);
bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
|| (m_textBox == renderer->firstTextBox() && textBoxStart == runStart && runStart > 0);
if (needSpace && !m_lastCharacter.isSpace()) {
emitCharacter(' ', m_node, runStart, runStart);
return;
}
long textBoxEnd = textBoxStart + m_textBox->m_len;
long runEnd = kMin(textBoxEnd, end);
if (runStart < runEnd) {
if (str[runStart] == '\n') {
emitCharacter(' ', m_node, runStart, runStart + 1);
m_offset = runStart + 1;
} else {
long subrunEnd = str.find('\n', runStart);
if (subrunEnd == -1 || subrunEnd > runEnd) {
subrunEnd = runEnd;
}
m_offset = subrunEnd;
m_positionNode = m_node;
m_positionStartOffset = runStart;
m_positionEndOffset = subrunEnd;
m_textCharacters = str.unicode() + runStart;
m_textLength = subrunEnd - runStart;
m_lastCharacter = str[subrunEnd - 1];
}
if (m_positionEndOffset < runEnd) {
return;
}
InlineTextBox *nextTextBox = m_textBox->nextTextBox();
long nextRunStart = nextTextBox ? nextTextBox->m_start : str.length();
if (nextRunStart > runEnd) {
m_lastTextNodeEndedWithCollapsedSpace = true; }
m_textBox = nextTextBox;
return;
}
}
}
bool TextIterator::handleReplacedElement()
{
if (m_lastTextNodeEndedWithCollapsedSpace) {
long offset = m_lastTextNode->nodeIndex();
emitCharacter(' ', m_lastTextNode->parentNode(), offset + 1, offset + 1);
return false;
}
long offset = m_node->nodeIndex();
m_positionNode = m_node->parentNode();
m_positionStartOffset = offset;
m_positionEndOffset = offset + 1;
m_textCharacters = 0;
m_textLength = 0;
m_lastCharacter = 0;
return true;
}
bool TextIterator::handleNonTextNode()
{
switch (m_node->id()) {
case ID_BR: {
long offset = m_node->nodeIndex();
emitCharacter('\n', m_node->parentNode(), offset, offset + 1);
break;
}
case ID_TD:
case ID_TH:
if (m_lastCharacter != '\n' && m_lastTextNode) {
long offset = m_lastTextNode->nodeIndex();
emitCharacter('\t', m_lastTextNode->parentNode(), offset, offset + 1);
}
break;
case ID_BLOCKQUOTE:
case ID_DD:
case ID_DIV:
case ID_DL:
case ID_DT:
case ID_H1:
case ID_H2:
case ID_H3:
case ID_H4:
case ID_H5:
case ID_H6:
case ID_HR:
case ID_LI:
case ID_OL:
case ID_P:
case ID_PRE:
case ID_TR:
case ID_UL:
if (m_lastCharacter != '\n' && m_lastTextNode) {
long offset = m_lastTextNode->nodeIndex();
emitCharacter('\n', m_lastTextNode->parentNode(), offset, offset + 1);
}
break;
}
return true;
}
void TextIterator::exitNode()
{
bool endLine = false;
bool addNewline = false;
switch (m_node->id()) {
case ID_BLOCKQUOTE:
case ID_DD:
case ID_DIV:
case ID_DL:
case ID_DT:
case ID_HR:
case ID_LI:
case ID_OL:
case ID_PRE:
case ID_TR:
case ID_UL:
endLine = true;
break;
case ID_H1:
case ID_H2:
case ID_H3:
case ID_H4:
case ID_H5:
case ID_H6:
case ID_P:
endLine = true;
addNewline = true;
break;
}
if (endLine && m_lastCharacter != '\n' && m_lastTextNode) {
long offset = m_lastTextNode->nodeIndex();
emitCharacter('\n', m_lastTextNode->parentNode(), offset, offset + 1);
m_needAnotherNewline = addNewline;
} else if (addNewline && m_lastTextNode) {
long offset = m_node->childNodeCount();
emitCharacter('\n', m_node, offset, offset);
}
}
void TextIterator::emitCharacter(QChar c, NodeImpl *textNode, long textStartOffset, long textEndOffset)
{
m_singleCharacterBuffer = c;
m_positionNode = textNode;
m_positionStartOffset = textStartOffset;
m_positionEndOffset = textEndOffset;
m_textCharacters = &m_singleCharacterBuffer;
m_textLength = 1;
m_lastTextNodeEndedWithCollapsedSpace = false;
m_lastCharacter = c;
}
Range TextIterator::position() const
{
assert(m_positionNode);
return Range(m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
}
CharacterIterator::CharacterIterator()
: m_offset(0), m_runOffset(0), m_atBreak(true)
{
}
CharacterIterator::CharacterIterator(const Range &r)
: m_offset(0), m_runOffset(0), m_atBreak(true), m_textIterator(r)
{
while (!atEnd() && m_textIterator.textLength() == 0) {
m_textIterator.advance();
}
}
Range CharacterIterator::position() const
{
Range r = m_textIterator.position();
if (m_textIterator.textLength() <= 1) {
assert(m_runOffset == 0);
} else {
Node n = r.startContainer();
assert(n == r.endContainer());
long offset = r.startOffset() + m_runOffset;
r.setStart(n, offset);
r.setEnd(n, offset + 1);
}
return r;
}
void CharacterIterator::advance(long count)
{
assert(!atEnd());
m_atBreak = false;
long remaining = m_textIterator.textLength() - m_runOffset;
if (count < remaining) {
m_runOffset += count;
m_offset += count;
return;
}
count -= remaining;
m_offset += remaining;
for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
long runLength = m_textIterator.textLength();
if (runLength == 0) {
m_atBreak = true;
} else {
if (count < runLength) {
m_runOffset = count;
m_offset += count;
return;
}
count -= runLength;
m_offset += runLength;
}
}
m_atBreak = true;
m_runOffset = 0;
}
CircularSearchBuffer::CircularSearchBuffer(const QString &s, bool isCaseSensitive)
: m_target(s)
{
assert(!s.isEmpty());
if (!isCaseSensitive) {
m_target = s.lower();
}
m_target.replace(nonBreakingSpace, ' ');
m_isCaseSensitive = isCaseSensitive;
m_buffer = static_cast<QChar *>(malloc(s.length() * sizeof(QChar)));
m_cursor = m_buffer;
m_bufferFull = false;
}
void CircularSearchBuffer::append(const QChar &c)
{
if (m_isCaseSensitive) {
*m_cursor++ = c.unicode() == nonBreakingSpace ? ' ' : c.unicode();
} else {
*m_cursor++ = c.unicode() == nonBreakingSpace ? ' ' : c.lower().unicode();
}
if (m_cursor == m_buffer + length()) {
m_cursor = m_buffer;
m_bufferFull = true;
}
}
void CircularSearchBuffer::append(long count, const QChar *characters)
{
long tailSpace = m_buffer + length() - m_cursor;
assert(!m_bufferFull);
assert(count <= tailSpace);
if (m_isCaseSensitive) {
for (long i = 0; i != count; ++i) {
QChar c = characters[i];
m_cursor[i] = c.unicode() == nonBreakingSpace ? ' ' : c.unicode();
}
} else {
for (long i = 0; i != count; ++i) {
QChar c = characters[i];
m_cursor[i] = c.unicode() == nonBreakingSpace ? ' ' : c.lower().unicode();
}
}
if (count < tailSpace) {
m_cursor += count;
} else {
m_bufferFull = true;
m_cursor = m_buffer;
}
}
long CircularSearchBuffer::neededCharacters() const
{
return m_bufferFull ? 0 : m_buffer + length() - m_cursor;
}
bool CircularSearchBuffer::isMatch() const
{
assert(m_bufferFull);
long headSpace = m_cursor - m_buffer;
long tailSpace = length() - headSpace;
return memcmp(m_cursor, m_target.unicode(), tailSpace * sizeof(QChar)) == 0
&& memcmp(m_buffer, m_target.unicode() + tailSpace, headSpace * sizeof(QChar)) == 0;
}
QString plainText(const Range &r)
{
long length = 0;
for (TextIterator it(r); !it.atEnd(); it.advance()) {
length += it.textLength();
}
QString result("");
result.reserve(length);
for (TextIterator it(r); !it.atEnd(); it.advance()) {
result.append(it.textCharacters(), it.textLength());
}
return result;
}
Range findPlainText(const Range &r, const QString &s, bool forward, bool caseSensitive)
{
if (s.isEmpty() || s.find('\n') != -1) {
Range result = r;
result.collapse(forward);
return result;
}
CircularSearchBuffer buffer(s, caseSensitive);
bool found = false;
CharacterIterator rangeEnd;
{
CharacterIterator it(r);
while (!it.atEnd()) {
while (long needed = buffer.neededCharacters()) {
long available = it.numCharacters();
long runLength = kMin(needed, available);
buffer.append(runLength, it.characters());
it.advance(runLength);
if (it.atBreak()) {
if (it.atEnd()) {
goto done;
}
buffer.clear();
}
}
do {
if (buffer.isMatch()) {
found = true;
rangeEnd = it;
if (forward) {
goto done;
}
}
buffer.append(it.characters()[0]);
it.advance(1);
} while (!it.atBreak());
buffer.clear();
}
}
done:
Range result = r;
if (!found) {
result.collapse(!forward);
} else {
CharacterIterator it(r);
it.advance(rangeEnd.characterOffset() - buffer.length());
result.setStart(it.position().startContainer(), it.position().startOffset());
it.advance(buffer.length() - 1);
result.setEnd(it.position().endContainer(), it.position().endOffset());
}
return result;
}
}