#include "config.h"
#include "URLHelpers.h"
#include "URLParser.h"
#include <mutex>
#include <unicode/uidna.h>
#include <unicode/uscript.h>
#include <wtf/Optional.h>
#include <wtf/text/WTFString.h>
namespace WTF {
namespace URLHelpers {
// Capacity, in UTF-16 code units, of the stack buffer that receives a converted host name.
// mapHostName also refuses to process inputs longer than this.
constexpr unsigned hostNameBufferLength = 2048;
// Inline capacity of the vectors used while building user-visible URL strings.
constexpr unsigned urlBytesBufferLength = 2048;
// Exclusive upper bound on the script codes that the bitmap below can record.
constexpr auto scriptCodeLimit = static_cast<UScriptCode>(256);
// One bit per script code: bit (script % 32) of word (script / 32) is set when the script
// has been added to the IDN allow-list.
static uint32_t allowedIDNScriptBits[(scriptCodeLimit + 31) / 32];
#if !PLATFORM(COCOA)
// Fills the allowed-script bitmap with the default list. std::call_once guarantees the
// initializer runs a single time even if this is called repeatedly.
void loadIDNAllowedScriptList()
{
    static std::once_flag initializationFlag;
    std::call_once(initializationFlag, [] {
        initializeDefaultIDNAllowedScriptList();
    });
}
#endif // !PLATFORM(COCOA)
// Returns true for the four Armenian code points that isLookalikeCharacter treats specially
// (U+0548, U+054D, U+0578, U+057D).
static bool isArmenianLookalikeCharacter(UChar32 codePoint)
{
    switch (codePoint) {
    case 0x0548:
    case 0x054D:
    case 0x0578:
    case 0x057D:
        return true;
    default:
        return false;
    }
}
// Returns true when ICU reports the code point's script as Armenian.
// An ICU error is logged and treated as "not Armenian".
static bool isArmenianScriptCharacter(UChar32 codePoint)
{
    UErrorCode status = U_ZERO_ERROR;
    UScriptCode scriptOfCodePoint = uscript_getScript(codePoint, &status);
    if (status == U_ZERO_ERROR)
        return scriptOfCodePoint == USCRIPT_ARMENIAN;

    LOG_ERROR("got ICU error while trying to look at scripts: %d", status);
    return false;
}
// Returns true for ASCII digits and for ASCII punctuation, except the punctuation
// characters listed below, which are always rejected.
template<typename CharacterType> inline bool isASCIIDigitOrValidHostCharacter(CharacterType charCode)
{
    if (!isASCIIDigitOrPunctuation(charCode))
        return false;

    const bool isRejectedPunctuation = charCode == '#'
        || charCode == '%'
        || charCode == '/'
        || charCode == ':'
        || charCode == '?'
        || charCode == '@'
        || charCode == '['
        || charCode == '\\'
        || charCode == ']';
    return !isRejectedPunctuation;
}
// Decides whether charCode must be treated as an unsafe "lookalike" when it follows
// previousCodePoint (which is empty for the first code point of a string).
//
// Returns true for:
//   - any code point that is non-printable, Unicode white space, or default-ignorable;
//   - every code point in the fixed table below;
//   - U+0307 when it follows U+0237, U+0131 or U+05D5;
//   - one of the four special Armenian letters when the preceding code point is neither an
//     allowed ASCII digit/punctuation character nor of Armenian script;
//   - any other code point that follows one of those four Armenian letters and is itself
//     neither of Armenian script nor an allowed ASCII digit/punctuation character.
// '.' is never a lookalike.
static bool isLookalikeCharacter(const Optional<UChar32>& previousCodePoint, UChar32 charCode)
{
if (!u_isprint(charCode) || u_isUWhiteSpace(charCode) || u_hasBinaryProperty(charCode, UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
return true;
switch (charCode) {
// Unconditionally rejected code points.
// U+00BC..U+00BE: vulgar fractions one quarter, one half, three quarters.
case 0x00BC:
case 0x00BD:
case 0x00BE:
case 0x00ED:
case 0x01C0:
case 0x01C3:
case 0x0251:
case 0x0261:
case 0x027E:
case 0x02D0:
case 0x0335:
case 0x0337:
case 0x0338:
case 0x0589:
case 0x05B4:
case 0x05BC:
case 0x05C3:
case 0x05F4:
case 0x0609:
case 0x060A:
case 0x0650:
case 0x0660:
case 0x066A:
case 0x06D4:
case 0x06F0:
case 0x0701:
case 0x0702:
case 0x0703:
case 0x0704:
case 0x1735:
case 0x1D04:
case 0x1D0F:
case 0x1D1C:
case 0x1D20:
case 0x1D21:
case 0x1D22:
case 0x1ECD:
case 0x2010:
case 0x2011:
case 0x2024:
case 0x2027:
case 0x2039:
case 0x203A:
case 0x2041:
case 0x2044:
case 0x2052:
// U+2153..U+215F: fraction characters from the Number Forms block.
case 0x2153:
case 0x2154:
case 0x2155:
case 0x2156:
case 0x2157:
case 0x2158:
case 0x2159:
case 0x215A:
case 0x215B:
case 0x215C:
case 0x215D:
case 0x215E:
case 0x215F:
case 0x2212:
case 0x2215:
case 0x2216:
case 0x2236:
case 0x233F:
case 0x23AE:
case 0x244A:
case 0x2571:
case 0x2572:
case 0x29F6:
case 0x29F8:
case 0x2AFB:
case 0x2AFD:
// U+2FF0..U+2FFB: ideographic description characters.
case 0x2FF0:
case 0x2FF1:
case 0x2FF2:
case 0x2FF3:
case 0x2FF4:
case 0x2FF5:
case 0x2FF6:
case 0x2FF7:
case 0x2FF8:
case 0x2FF9:
case 0x2FFA:
case 0x2FFB:
case 0x3002:
case 0x3008:
case 0x3014:
case 0x3015:
case 0x3033:
case 0x3035:
case 0x321D:
case 0x321E:
case 0x33AE:
case 0x33AF:
case 0x33C6:
case 0x33DF:
case 0x05B9:
case 0x05BA:
case 0x05C1:
case 0x05C2:
case 0x05C4:
case 0xA731:
case 0xA771:
case 0xA789:
case 0xFE14:
case 0xFE15:
case 0xFE3F:
case 0xFE5D:
case 0xFE5E:
case 0xFF0E:
case 0xFF0F:
case 0xFF61:
case 0xFFFC:
case 0xFFFD:
// U+1F50F..U+1F513: lock and key emoji.
case 0x1F50F:
case 0x1F510:
case 0x1F511:
case 0x1F512:
case 0x1F513:
return true;
// U+0307 COMBINING DOT ABOVE is rejected only after a base it could visually alter:
// U+0237 (dotless j), U+0131 (dotless i) or U+05D5 (Hebrew vav).
case 0x0307:
return previousCodePoint == 0x0237
|| previousCodePoint == 0x0131
|| previousCodePoint == 0x05D5;
// The special Armenian letters are accepted at the start of the string, or when the
// preceding code point is Armenian or an allowed ASCII digit/punctuation character.
case 0x0548:
case 0x054D:
case 0x0578:
case 0x057D:
return previousCodePoint
&& !isASCIIDigitOrValidHostCharacter(previousCodePoint.value())
&& !isArmenianScriptCharacter(previousCodePoint.value());
case '.':
return false;
// Any other code point is rejected only when it directly follows one of the special
// Armenian letters and is neither Armenian nor an allowed ASCII digit/punctuation character.
default:
return previousCodePoint
&& isArmenianLookalikeCharacter(previousCodePoint.value())
&& !(isArmenianScriptCharacter(charCode) || isASCIIDigitOrValidHostCharacter(charCode));
}
}
// Marks the given script code as allowed in the bitmap.
// Values outside [0, scriptCodeLimit) are silently ignored.
static void addScriptToIDNAllowedScriptList(int32_t script)
{
    if (script < 0 || script >= scriptCodeLimit)
        return;

    const size_t wordIndex = script / 32;
    const uint32_t bit = 1 << (script % 32);
    allowedIDNScriptBits[wordIndex] |= bit;
}
// Convenience overload: forwards the ICU script enumerator to the int32_t overload.
static void addScriptToIDNAllowedScriptList(UScriptCode script)
{
    const auto scriptValue = static_cast<int32_t>(script);
    addScriptToIDNAllowedScriptList(scriptValue);
}
void addScriptToIDNAllowedScriptList(const char* scriptName)
{
addScriptToIDNAllowedScriptList(u_getPropertyValueEnum(UCHAR_SCRIPT, scriptName));
}
void initializeDefaultIDNAllowedScriptList()
{
constexpr UScriptCode scripts[] = {
USCRIPT_COMMON,
USCRIPT_INHERITED,
USCRIPT_ARABIC,
USCRIPT_ARMENIAN,
USCRIPT_BOPOMOFO,
USCRIPT_CANADIAN_ABORIGINAL,
USCRIPT_DEVANAGARI,
USCRIPT_DESERET,
USCRIPT_GUJARATI,
USCRIPT_GURMUKHI,
USCRIPT_HANGUL,
USCRIPT_HAN,
USCRIPT_HEBREW,
USCRIPT_HIRAGANA,
USCRIPT_KATAKANA_OR_HIRAGANA,
USCRIPT_KATAKANA,
USCRIPT_LATIN,
USCRIPT_TAMIL,
USCRIPT_THAI,
USCRIPT_YI,
};
for (auto script : scripts)
addScriptToIDNAllowedScriptList(script);
}
static bool allCharactersInAllowedIDNScriptList(const UChar* buffer, int32_t length)
{
loadIDNAllowedScriptList();
int32_t i = 0;
Optional<UChar32> previousCodePoint;
while (i < length) {
UChar32 c;
U16_NEXT(buffer, i, length, c);
UErrorCode error = U_ZERO_ERROR;
UScriptCode script = uscript_getScript(c, &error);
if (error != U_ZERO_ERROR) {
LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
return false;
}
if (script < 0) {
LOG_ERROR("got negative number for script code from ICU: %d", script);
return false;
}
if (script >= scriptCodeLimit)
return false;
size_t index = script / 32;
uint32_t mask = 1 << (script % 32);
if (!(allowedIDNScriptBits[index] & mask))
return false;
if (isLookalikeCharacter(previousCodePoint, c))
return false;
previousCodePoint = c;
}
return true;
}
template<typename Func>
static inline bool isSecondLevelDomainNameAllowedByTLDRules(const UChar* buffer, int32_t length, Func characterIsAllowed)
{
ASSERT(length > 0);
for (int32_t i = length - 1; i >= 0; --i) {
UChar ch = buffer[i];
if (characterIsAllowed(ch))
continue;
if (ch == '.')
break;
return false;
}
return true;
}
// Helper for allCharactersAllowedByTLDRules. Expects variables named `buffer` and `length`
// to be in scope at the expansion site. If the host is longer than `suffix` and ends with
// it, the expansion RETURNS from the enclosing function with the result of checking the
// label before the suffix using `function`; otherwise execution falls through.
#define CHECK_RULES_IF_SUFFIX_MATCHES(suffix, function) \
{ \
static const int32_t suffixLength = sizeof(suffix) / sizeof(suffix[0]); \
if (length > suffixLength && !memcmp(buffer + length - suffixLength, suffix, sizeof(suffix))) \
return isSecondLevelDomainNameAllowedByTLDRules(buffer, length - suffixLength, function); \
}
// Accepts code units in U+0430..U+044F, U+0451, ASCII digits and '-'.
static bool isRussianDomainNameCharacter(UChar ch)
{
    if (ch >= 0x0430 && ch <= 0x044f)
        return true;
    if (ch == 0x0451)
        return true;
    return isASCIIDigit(ch) || ch == '-';
}
// Checks a host name against per-TLD character rules for a fixed set of Cyrillic suffixes.
// Each CHECK_RULES_IF_SUFFIX_MATCHES expansion returns from this function when the host ends
// with the given suffix; the result is whether the label immediately before the suffix
// consists only of characters accepted by the associated predicate.
// Returns false when no known suffix matches.
// NOTE(review): buffer[length - 1] is read unconditionally, so this assumes length >= 1 —
// confirm callers never pass an empty buffer.
static bool allCharactersAllowedByTLDRules(const UChar* buffer, int32_t length)
{
// Ignore a single trailing dot.
if (buffer[length - 1] == '.')
length--;
// Suffix: '.' U+0440 U+0444.
static const UChar cyrillicRF[] = {
'.',
0x0440, 0x0444, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRF, isRussianDomainNameCharacter);
// Suffix: '.' U+0440 U+0443 U+0441.
static const UChar cyrillicRUS[] = {
'.',
0x0440, 0x0443, 0x0441, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicRUS, isRussianDomainNameCharacter);
// Suffix: '.' U+043C U+043E U+0441 U+043A U+0432 U+0430.
static const UChar cyrillicMOSKVA[] = {
'.',
0x043C, 0x043E, 0x0441, 0x043A, 0x0432, 0x0430, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMOSKVA, isRussianDomainNameCharacter);
// Suffix: '.' U+0434 U+0435 U+0442 U+0438.
static const UChar cyrillicDETI[] = {
'.',
0x0434, 0x0435, 0x0442, 0x0438, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicDETI, isRussianDomainNameCharacter);
// Suffix: '.' U+043E U+043D U+043B U+0430 U+0439 U+043D.
static const UChar cyrillicONLAYN[] = {
'.',
0x043E, 0x043D, 0x043B, 0x0430, 0x0439, 0x043D, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicONLAYN, isRussianDomainNameCharacter);
// Suffix: '.' U+0441 U+0430 U+0439 U+0442.
static const UChar cyrillicSAYT[] = {
'.',
0x0441, 0x0430, 0x0439, 0x0442, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSAYT, isRussianDomainNameCharacter);
// Suffix: '.' U+043E U+0440 U+0433.
static const UChar cyrillicORG[] = {
'.',
0x043E, 0x0440, 0x0433, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicORG, isRussianDomainNameCharacter);
// Suffix: '.' U+0431 U+0435 U+043B. Adds U+0456, U+045E and U+2019 to the Russian set.
static const UChar cyrillicBEL[] = {
'.',
0x0431, 0x0435, 0x043B, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBEL, [](UChar ch) {
return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x0456 || ch == 0x045E || ch == 0x2019 || isASCIIDigit(ch) || ch == '-';
});
// Suffix: '.' U+049B U+0430 U+0437. Adds several extended Cyrillic letters to the Russian set.
static const UChar cyrillicKAZ[] = {
'.',
0x049B, 0x0430, 0x0437, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicKAZ, [](UChar ch) {
return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x04D9 || ch == 0x0493 || ch == 0x049B || ch == 0x04A3 || ch == 0x04E9 || ch == 0x04B1 || ch == 0x04AF || ch == 0x04BB || ch == 0x0456 || isASCIIDigit(ch) || ch == '-';
});
// Suffix: '.' U+0443 U+043A U+0440. Adds U+0491, U+0404, U+0456 and U+0457 to the Russian set.
static const UChar cyrillicUKR[] = {
'.',
0x0443, 0x043A, 0x0440, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicUKR, [](UChar ch) {
return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x0491 || ch == 0x0404 || ch == 0x0456 || ch == 0x0457 || isASCIIDigit(ch) || ch == '-';
});
// Suffix: '.' U+0441 U+0440 U+0431. Uses a narrower base range (U+0439 is excluded).
static const UChar cyrillicSRB[] = {
'.',
0x0441, 0x0440, 0x0431, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicSRB, [](UChar ch) {
return (ch >= 0x0430 && ch <= 0x0438) || (ch >= 0x043A && ch <= 0x0448) || ch == 0x0452 || ch == 0x0458 || ch == 0x0459 || ch == 0x045A || ch == 0x045B || ch == 0x045F || isASCIIDigit(ch) || ch == '-';
});
// Suffix: '.' U+043C U+043A U+0434. Same base ranges as the previous rule, different extras.
static const UChar cyrillicMKD[] = {
'.',
0x043C, 0x043A, 0x0434, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMKD, [](UChar ch) {
return (ch >= 0x0430 && ch <= 0x0438) || (ch >= 0x043A && ch <= 0x0448) || ch == 0x0453 || ch == 0x0455 || ch == 0x0458 || ch == 0x0459 || ch == 0x045A || ch == 0x045C || ch == 0x045F || isASCIIDigit(ch) || ch == '-';
});
// Suffix: '.' U+043C U+043E U+043D. Adds U+04E9 and U+04AF to the Russian set.
static const UChar cyrillicMON[] = {
'.',
0x043C, 0x043E, 0x043D, };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicMON, [](UChar ch) {
return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x04E9 || ch == 0x04AF || isASCIIDigit(ch) || ch == '-';
});
// Suffix: '.' U+0431 U+0433.
static const UChar cyrillicBG[] = {
'.',
0x0431, 0x0433 };
CHECK_RULES_IF_SUFFIX_MATCHES(cyrillicBG, [](UChar ch) {
return (ch >= 0x0430 && ch <= 0x044A) || ch == 0x044C || (ch >= 0x044E && ch <= 0x0450) || ch == 0x045D || isASCIIDigit(ch) || ch == '-';
});
// No known suffix matched.
return false;
}
// Converts a single host name between its Unicode and ASCII forms using ICU's UTS #46 API.
//
// When decodeFunction is non-null the host is converted with uidna_nameToASCII, after first
// being passed through decodeFunction if it contains a '%'. When decodeFunction is null the
// host is converted with uidna_nameToUnicode.
//
// Returns:
//   - nullopt if ICU reports a failure or an IDNA processing error that is not tolerated;
//   - a null String if the host is empty, longer than hostNameBufferLength, unchanged by the
//     conversion, or (Unicode direction only) rejected by both the script allow-list and the
//     per-TLD character rules;
//   - otherwise the converted host name.
Optional<String> mapHostName(const String& hostName, URLDecodeFunction decodeFunction)
{
    if (hostName.length() > hostNameBufferLength)
        return String();
    if (!hostName.length())
        return String();

    // Look for '%' in the input host name. The local `string` is still null at this point, so
    // testing it instead would never be true and decodeFunction would never be applied.
    String string;
    if (decodeFunction && hostName.contains('%'))
        string = (*decodeFunction)(hostName);
    else
        string = hostName;

    unsigned length = string.length();
    auto sourceBuffer = string.charactersWithNullTermination();

    UChar destinationBuffer[hostNameBufferLength];
    UErrorCode uerror = U_ZERO_ERROR;
    UIDNAInfo processingDetails = UIDNA_INFO_INITIALIZER;
    int32_t numCharactersConverted = (decodeFunction ? uidna_nameToASCII : uidna_nameToUnicode)(&URLParser::internationalDomainNameTranscoder(), sourceBuffer.data(), length, destinationBuffer, hostNameBufferLength, &processingDetails, &uerror);

    // In the Unicode direction a few label-shape errors are tolerated; in the ASCII direction none are.
    int allowedErrors = decodeFunction ? 0 : UIDNA_ERROR_EMPTY_LABEL | UIDNA_ERROR_LEADING_HYPHEN | UIDNA_ERROR_TRAILING_HYPHEN | UIDNA_ERROR_HYPHEN_3_4;
    if (length && (U_FAILURE(uerror) || processingDetails.errors & ~allowedErrors))
        return nullopt;

    // Conversion produced the same code units: report "no mapping needed".
    if (numCharactersConverted == static_cast<int32_t>(length) && !memcmp(sourceBuffer.data(), destinationBuffer, length * sizeof(UChar)))
        return String();

    // Only show the Unicode form when it passes the script allow-list or the per-TLD rules.
    if (!decodeFunction && !allCharactersInAllowedIDNScriptList(destinationBuffer, numCharactersConverted) && !allCharactersAllowedByTLDRules(destinationBuffer, numCharactersConverted))
        return String();

    return String(destinationBuffer, numCharactersConverted);
}
// Host-name ranges collected for replacement, as (location, length, mapped host) tuples.
// As used in this file: nullopt means no range required mapping, while an engaged vector
// that is still empty is treated by mapHostNames as a failure.
using MappingRangesVector = Optional<Vector<std::tuple<unsigned, unsigned, String>>>;
static void collectRangesThatNeedMapping(const String& string, unsigned location, unsigned length, MappingRangesVector& array, URLDecodeFunction decodeFunction)
{
String substring = string.substringSharingImpl(location, length);
Optional<String> host = mapHostName(substring, decodeFunction);
if (host && !*host)
return;
if (!array)
array = Vector<std::tuple<unsigned, unsigned, String>>();
if (host)
array->constructAndAppend(location, length, *host);
}
// Scans a mailto: URL string for host names and hands each one to collectRangesThatNeedMapping.
// A host name starts after an '@' and ends at the next '>', ',' or '?' (or at the end of the
// string). Double-quoted sections are skipped, honouring backslash escapes, so that characters
// inside them are not mistaken for delimiters. Scanning stops at the first unquoted '?'.
static void applyHostNameFunctionToMailToURLString(const String& string, URLDecodeFunction decodeFunction, MappingRangesVector& array)
{
unsigned stringLength = string.length();
unsigned current = 0;
while (1) {
// Find the start of the next host name or quoted string, or the '?' that ends the scan.
auto hostNameOrStringStart = string.find([](UChar ch) {
return ch == '"' || ch == '@' || ch == '?';
}, current);
if (hostNameOrStringStart == notFound)
return;
UChar c = string[hostNameOrStringStart];
current = hostNameOrStringStart + 1;
if (c == '?')
return;
if (c == '@') {
// Find the end of the host name.
unsigned hostNameStart = current;
auto hostNameEnd = string.find([](UChar ch) {
return ch == '>' || ch == ',' || ch == '?';
}, current);
bool done;
if (hostNameEnd == notFound) {
// The host name runs to the end of the string; this is the last one.
hostNameEnd = stringLength;
done = true;
} else {
// Resume scanning at the terminator so that a '?' there is seen by the outer loop.
current = hostNameEnd;
done = false;
}
collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction);
if (done)
return;
} else {
// Skip over a quoted string.
ASSERT(c == '"');
while (1) {
auto escapedCharacterOrStringEnd = string.find([](UChar ch) {
return ch == '"' || ch == '\\';
}, current);
// Unterminated quoted string: nothing more to find.
if (escapedCharacterOrStringEnd == notFound)
return;
c = string[escapedCharacterOrStringEnd];
current = escapedCharacterOrStringEnd + 1;
// Closing quote: go back to looking for host names.
if (c == '"')
break;
// Backslash: skip the escaped character, unless the string ends right after it.
ASSERT(c == '\\');
if (current == stringLength)
return;
++current;
}
}
}
}
static void applyHostNameFunctionToURLString(const String& string, URLDecodeFunction decodeFunction, MappingRangesVector& array)
{
if (protocolIs(string, "mailto")) {
applyHostNameFunctionToMailToURLString(string, decodeFunction, array);
return;
}
static const char* separator = "://";
auto separatorIndex = string.find(separator);
if (separatorIndex == notFound)
return;
unsigned authorityStart = separatorIndex + strlen(separator);
if (StringView { string }.left(separatorIndex).contains([](UChar character) {
return !(isASCIIAlphanumeric(character) || character == '+' || character == '-' || character == '.');
}))
return;
auto hostNameTerminator = string.find([](UChar character) {
return character == ':' || character == '/' || character == '?' || character == '#';
}, authorityStart);
unsigned hostNameEnd = hostNameTerminator == notFound ? string.length() : hostNameTerminator;
auto userInfoTerminator = StringView { string }.left(hostNameEnd).find('@', authorityStart);
unsigned hostNameStart = userInfoTerminator == notFound ? authorityStart : userInfoTerminator + 1;
collectRangesThatNeedMapping(string, hostNameStart, hostNameEnd - hostNameStart, array, decodeFunction);
}
// Returns a copy of the URL string with each host name that needs it replaced by its mapped
// form. Returns the input unchanged when nothing needs mapping (including the all-ASCII fast
// path when decodeFunction is set), and a null String when a range vector was created but no
// replacement was collected.
String mapHostNames(const String& string, URLDecodeFunction decodeFunction)
{
    if (decodeFunction && string.isAllASCII())
        return string;

    MappingRangesVector rangesToMap;
    applyHostNameFunctionToURLString(string, decodeFunction, rangesToMap);
    if (!rangesToMap)
        return string;
    if (rangesToMap->isEmpty())
        return { };

    // Apply replacements from the last range to the first so earlier offsets stay valid.
    String mapped = string;
    do {
        auto range = rangesToMap->takeLast();
        mapped = mapped.replace(std::get<0>(range), std::get<1>(range), std::get<2>(range));
    } while (!rangesToMap->isEmpty());
    return mapped;
}
// Returns sourceBuffer with every lookalike code point (as judged by isLookalikeCharacter in
// the context of the preceding code point) replaced by the %-escaped bytes of its UTF-8
// encoding. The input string itself is returned when nothing needs escaping.
static String escapeUnsafeCharacters(const String& sourceBuffer)
{
    const unsigned length = sourceBuffer.length();
    Optional<UChar32> precedingCodePoint;

    // Phase 1: find the first code point that has to be escaped.
    unsigned index = 0;
    while (index < length) {
        UChar32 codePoint = sourceBuffer.characterStartingAt(index);
        if (isLookalikeCharacter(precedingCodePoint, codePoint))
            break;
        precedingCodePoint = codePoint;
        index += U16_LENGTH(codePoint);
    }
    if (index == length)
        return sourceBuffer;

    // Phase 2: copy the clean prefix verbatim, then escape as needed from there on.
    Vector<UChar, urlBytesBufferLength> escaped;
    escaped.grow(index);
    if (sourceBuffer.is8Bit())
        StringImpl::copyCharacters(escaped.data(), sourceBuffer.characters8(), index);
    else
        StringImpl::copyCharacters(escaped.data(), sourceBuffer.characters16(), index);

    while (index < length) {
        UChar32 codePoint = sourceBuffer.characterStartingAt(index);
        const unsigned codeUnitCount = U16_LENGTH(codePoint);

        if (!isLookalikeCharacter(precedingCodePoint, codePoint)) {
            for (unsigned unit = 0; unit < codeUnitCount; ++unit)
                escaped.append(sourceBuffer[index + unit]);
        } else {
            uint8_t utf8Bytes[4];
            size_t utf8Length = 0;
            UBool appendFailed = false;
            U8_APPEND(utf8Bytes, utf8Length, 4, codePoint, appendFailed);
            ASSERT_UNUSED(appendFailed, !appendFailed);

            for (size_t byteIndex = 0; byteIndex < utf8Length; ++byteIndex) {
                const uint8_t utf8Byte = utf8Bytes[byteIndex];
                escaped.append('%');
                escaped.append(upperNibbleToASCIIHexDigit(utf8Byte));
                escaped.append(lowerNibbleToASCIIHexDigit(utf8Byte));
            }
        }

        precedingCodePoint = codePoint;
        index += codeUnitCount;
    }

    return String::adopt(WTFMove(escaped));
}
// Produces the string shown to the user for the given URL bytes.
//
// Steps:
//   1. %XX escapes whose value is above 0x7f are unescaped to raw bytes; all other bytes,
//      including escapes of ASCII values, are copied through unchanged.
//   2. The result is interpreted as UTF-8. If that fails, every byte above 0x7f is %-escaped
//      again and the now pure-ASCII result is used.
//   3. If an "xn--" prefix (case-insensitive on 'x' and 'n') was seen, host names are mapped
//      to their Unicode form via mapHostNames.
//   4. The string is NFC-normalized and lookalike characters are %-escaped.
//
// Returns a null String for empty input or if the working buffer size would overflow an int.
String userVisibleURL(const CString& url)
{
    auto* before = reinterpret_cast<const unsigned char*>(url.data());
    int length = url.length();
    if (!length)
        return { };

    bool mayNeedHostNameDecoding = false;

    // The buffer must be able to hold every input byte %-escaped (3 bytes each) plus a NUL.
    Checked<int, RecordOverflow> bufferLength = length;
    bufferLength = bufferLength * 3 + 1;
    if (bufferLength.hasOverflowed())
        return { };
    Vector<char, urlBytesBufferLength> after(bufferLength.unsafeGet());

    char* q = after.data();
    {
        const unsigned char* p = before;
        for (int i = 0; i < length; i++) {
            unsigned char c = p[i];
            if (c == '%' && i + 2 < length && isASCIIHexDigit(p[i + 1]) && isASCIIHexDigit(p[i + 2])) {
                auto u = toASCIIHexValue(p[i + 1], p[i + 2]);
                if (u > 0x7f) {
                    // Unescape bytes that can only be part of a multi-byte UTF-8 sequence.
                    *q++ = u;
                } else {
                    // Leave escapes of ASCII values as they are.
                    *q++ = p[i];
                    *q++ = p[i + 1];
                    *q++ = p[i + 2];
                }
                i += 2;
            } else {
                *q++ = c;

                // Cheap check for "xn--" ending at the byte just written. The bound is taken
                // from the output cursor rather than the input index: unescaping makes the
                // output shorter than the input, so the input index alone does not guarantee
                // that four output bytes exist and q[-4] could read before the buffer.
                if (c == '-' && !mayNeedHostNameDecoding && q - after.data() >= 4 && (q[-4] | 0x20) == 'x' && (q[-3] | 0x20) == 'n' && q[-2] == '-')
                    mayNeedHostNameDecoding = true;
            }
        }
        *q = '\0';
    }

    String result = String::fromUTF8(after.data());
    if (!result) {
        // Not valid UTF-8. Move the bytes to the end of the buffer, then copy them back to
        // the front while %-escaping every byte above 0x7f.
        int afterlength = q - after.data();
        char* source = after.data() + bufferLength.unsafeGet() - afterlength - 1;
        memmove(source, after.data(), afterlength + 1); // Also moves the trailing NUL.
        char* destination = after.data();
        while (*source) {
            unsigned char c = *source;
            if (c > 0x7f) {
                *destination++ = '%';
                *destination++ = upperNibbleToASCIIHexDigit(c);
                *destination++ = lowerNibbleToASCIIHexDigit(c);
            } else
                *destination++ = *source;
            source++;
        }
        *destination = '\0';
        result = String::fromUTF8(after.data());
        ASSERT(!!result);
    }

    if (mayNeedHostNameDecoding) {
        // A null result means the host names could not be mapped; keep the undecoded string.
        auto mappedResult = mapHostNames(result, nullptr);
        if (!!mappedResult)
            result = mappedResult;
    }

    return escapeUnsafeCharacters(normalizedNFC(result));
}
} }