// DecodeEscapeSequences.h [plain text]
#pragma once
#include "TextEncoding.h"
#include <wtf/ASCIICType.h>
#include <wtf/Assertions.h>
#include <wtf/text/StringBuilder.h>
namespace WebCore {
// Escape-sequence policy for the "%uXXXX" form, where XXXX is four hexadecimal digits
// that together encode one 16-bit code unit. Intended to be plugged into
// decodeEscapeSequences<>() as its EscapeSequence template argument.
struct Unicode16BitEscapeSequence {
    // Number of characters occupied by a single "%uXXXX" sequence.
    enum { sequenceSize = 6 };

    // Returns the position of the first "%u" at or after startPosition, as reported by StringView::find.
    static size_t findInString(StringView string, size_t startPosition)
    {
        return string.find(StringView("%u"), startPosition);
    }

    // Starting at startPosition, skips over as many back-to-back, well-formed "%uXXXX"
    // sequences as fit before endPosition and returns the position just past the last one.
    // Returns startPosition unchanged when not even one complete sequence is present.
    static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
    {
        size_t position = startPosition;
        while (true) {
            // Not enough characters left for a whole sequence.
            if (endPosition - position < sequenceSize)
                break;
            if (string[position] != '%' || string[position + 1] != 'u')
                break;
            bool hasFourHexDigits = isASCIIHexDigit(string[position + 2]) && isASCIIHexDigit(string[position + 3])
                && isASCIIHexDigit(string[position + 4]) && isASCIIHexDigit(string[position + 5]);
            if (!hasFourHexDigits)
                break;
            position += sequenceSize;
        }
        return position;
    }

    // Converts a run made only of "%uXXXX" sequences (as delimited by findEndOfRun) into a
    // string holding one code unit per sequence. The TextEncoding argument is ignored; it is
    // accepted so that this policy has the same decodeRun signature as the one
    // decodeEscapeSequences<>() calls.
    static String decodeRun(StringView run, const TextEncoding&)
    {
        const auto sequenceCount = run.length() / sequenceSize;
        StringBuilder builder;
        builder.reserveCapacity(sequenceCount);
        for (unsigned index = 0; index < sequenceCount; ++index) {
            // Skip the leading "%u" of each sequence and combine its four hex digits, most significant first.
            const unsigned base = index * sequenceSize;
            UChar codeUnit = (toASCIIHexValue(run[base + 2]) << 12)
                | (toASCIIHexValue(run[base + 3]) << 8)
                | (toASCIIHexValue(run[base + 4]) << 4)
                | toASCIIHexValue(run[base + 5]);
            builder.append(codeUnit);
        }
        return builder.toString();
    }
};
// Escape-sequence policy for the "%XX" percent-encoded form, where XX is two hexadecimal
// digits that encode one byte. Usable as the EscapeSequence template argument of
// decodeEscapeSequences<>() and used directly by decodeURLEscapeSequencesAsData().
struct URLEscapeSequence {
    // Number of characters occupied by a single "%XX" sequence.
    enum { sequenceSize = 3 };

    // Returns the position of the first '%' at or after startPosition, as reported by StringView::find.
    static size_t findInString(StringView string, size_t startPosition)
    {
        return string.find('%', startPosition);
    }

    // Returns the end of the run that begins at startPosition. A run consists of valid "%XX"
    // sequences, each optionally followed by at most two unescaped characters in the range
    // 0x40 - 0x7F. The run stops at endPosition, at a '%' that does not start a valid sequence,
    // at a character outside 0x40 - 0x7F, or at a third consecutive unescaped character.
    // NOTE(review): the unescaped characters are presumably kept in the run so that trailing
    // bytes of multi-byte encodings reach the decoder together with their escaped lead bytes;
    // confirm against the supported encodings.
    static size_t findEndOfRun(StringView string, size_t startPosition, size_t endPosition)
    {
        size_t position = startPosition;
        int unescapedSinceLastSequence = 0;
        while (position < endPosition) {
            auto character = string[position];
            if (character == '%') {
                bool startsValidSequence = endPosition - position >= sequenceSize
                    && isASCIIHexDigit(string[position + 1]) && isASCIIHexDigit(string[position + 2]);
                if (!startsValidSequence)
                    break;
                position += sequenceSize;
                unescapedSinceLastSequence = 0;
                continue;
            }

            bool isInTrailingRange = character >= 0x40 && character <= 0x7F;
            if (!isInTrailingRange || unescapedSinceLastSequence >= 2)
                break;
            ++position;
            ++unescapedSinceLastSequence;
        }
        return position;
    }

    // Turns a run (as delimited by findEndOfRun) into raw bytes: every "%XX" becomes the byte
    // it encodes and every other character is stored as a single char.
    static Vector<char, 512> decodeRun(StringView run)
    {
        const auto runLength = run.length();
        Vector<char, 512> buffer;
        // The decoded form is never longer than the run, so size for the worst case and trim afterwards.
        buffer.grow(runLength);
        char* cursor = buffer.data();
        for (unsigned index = 0; index < runLength;) {
            if (run[index] == '%') {
                *cursor++ = (toASCIIHexValue(run[index + 1]) << 4) | toASCIIHexValue(run[index + 2]);
                index += sequenceSize;
            } else {
                *cursor++ = run[index];
                ++index;
            }
        }
        const size_t decodedLength = cursor - buffer.data();
        ASSERT(buffer.size() >= decodedLength);
        buffer.shrink(decodedLength);
        return buffer;
    }

    // Decodes a run into bytes and then interprets those bytes using the given encoding,
    // falling back to UTF-8 when the encoding is not valid.
    static String decodeRun(StringView run, const TextEncoding& encoding)
    {
        auto bytes = decodeRun(run);
        if (encoding.isValid())
            return encoding.decode(bytes.data(), bytes.size());
        return UTF8Encoding().decode(bytes.data(), bytes.size());
    }
};
// Returns a copy of |string| in which every run of escape sequences recognized by the
// EscapeSequence policy has been replaced by its decoded text. Text outside of runs is
// copied through unchanged. A run whose decoded result is empty is left in its original,
// still-escaped form.
//
// EscapeSequence must provide static findInString(), findEndOfRun() and
// decodeRun(StringView, const TextEncoding&) members.
template<typename EscapeSequence>
String decodeEscapeSequences(StringView string, const TextEncoding& encoding)
{
    const size_t length = string.length();
    StringBuilder output;
    size_t copiedUpTo = 0; // Everything before this position is already reflected in |output|.
    size_t searchFrom = 0;

    while (true) {
        size_t runStart = EscapeSequence::findInString(string, searchFrom);
        if (runStart == notFound)
            break;

        size_t runEnd = EscapeSequence::findEndOfRun(string, runStart, length);
        if (runEnd == runStart) {
            // The introducer was not followed by a complete sequence; step past it and keep looking.
            searchFrom = runStart + 1;
            continue;
        }
        searchFrom = runEnd;

        String decoded = EscapeSequence::decodeRun(string.substring(runStart, runEnd - runStart), encoding);
        if (!decoded.isEmpty()) {
            // Flush the literal text preceding the run, then the decoded run itself.
            output.append(string.substring(copiedUpTo, runStart - copiedUpTo));
            output.append(decoded);
            copiedUpTo = runEnd;
        }
    }

    // Whatever follows the last replaced run is copied verbatim.
    output.append(string.substring(copiedUpTo, length - copiedUpTo));
    return output.toString();
}
// Converts |string| to bytes: text outside of "%XX" runs is encoded with |encoding|
// (using UnencodableHandling::URLEncodedEntities), while each run found by
// URLEscapeSequence is replaced by the raw bytes it decodes to. |encoding| is asserted
// to be valid.
inline Vector<uint8_t> decodeURLEscapeSequencesAsData(StringView string, const TextEncoding& encoding)
{
    ASSERT(encoding.isValid());

    Vector<uint8_t> bytes;
    size_t copiedUpTo = 0; // Everything before this position is already reflected in |bytes|.
    size_t searchFrom = 0;

    while (true) {
        const size_t runStart = URLEscapeSequence::findInString(string, searchFrom);
        if (runStart == notFound) {
            // No more runs: encode the remaining text. The length passed here exceeds what is
            // left of the string; substring() is relied upon to clamp it.
            bytes.appendVector(encoding.encode(string.substring(copiedUpTo, runStart - copiedUpTo), UnencodableHandling::URLEncodedEntities));
            return bytes;
        }

        const size_t runEnd = URLEscapeSequence::findEndOfRun(string, runStart, string.length());
        if (runEnd == runStart) {
            // A '%' that does not begin a valid sequence; treat it as ordinary text and search on.
            searchFrom = runStart + 1;
            continue;
        }
        searchFrom = runEnd;

        // Encode the literal text preceding the run, then append the run's decoded bytes.
        bytes.appendVector(encoding.encode(string.substring(copiedUpTo, runStart - copiedUpTo), UnencodableHandling::URLEncodedEntities));
        auto decodedRun = URLEscapeSequence::decodeRun(string.substring(runStart, runEnd - runStart));
        ASSERT(!decodedRun.isEmpty());
        bytes.appendVector(decodedRun);
        copiedUpTo = runEnd;
    }
}
}