// YarrCanonicalize.h
#pragma once
#include <stdint.h>
#include <unicode/utypes.h>
namespace JSC { namespace Yarr {
// Describes how the code points inside one CanonicalizationRange relate to
// their case-insensitive equivalents. The per-kind arithmetic lives in
// getCanonicalPair() and areCanonicallyEquivalent().
enum UCS2CanonicalizationType {
    // The character is equivalent only to itself.
    CanonicalizeUnique,
    // The range's `value` is an index into the character-set table; the
    // equivalents are the members of that zero-terminated list.
    CanonicalizeSet,
    // The partner is `ch + value`.
    CanonicalizeRangeLo,
    // The partner is `ch - value`.
    CanonicalizeRangeHi,
    // The partner is `ch ^ 1`, i.e. pairs of the form (2k, 2k + 1).
    CanonicalizeAlternatingAligned,
    // The partner is `((ch - 1) ^ 1) + 1`, i.e. pairs of the form (2k + 1, 2k + 2).
    CanonicalizeAlternatingUnaligned,
};
// One entry of a canonicalization range table: a run of code points that all
// share the same canonicalization rule.
struct CanonicalizationRange {
// First code point covered by this entry.
UChar32 begin;
// Last code point covered by this entry (inclusive; lookups test `ch <= end`).
UChar32 end;
// Rule-specific payload: an offset for the RangeLo/RangeHi kinds, an index
// into the character-set table for CanonicalizeSet. The other kinds do not
// read it in this header.
UChar32 value;
// Which rule applies to every code point in [begin, end].
UCS2CanonicalizationType type;
};
// Canonicalization tables, defined in another translation unit.
//
// For each mode (UCS2 / Unicode) there are three symbols:
//   *_CANONICALIZATION_RANGES - number of entries in the matching *RangeInfo
//                               array (used as the search bound in
//                               canonicalRangeInfoFor()).
//   *CharacterSetInfo         - array of pointers to zero-terminated UChar32
//                               lists, indexed by the `value` of a
//                               CanonicalizeSet range.
//   *RangeInfo                - array of ranges that canonicalRangeInfoFor()
//                               binary-searches; presumably sorted ascending
//                               and non-overlapping - confirm against the
//                               table generator.
extern const size_t UCS2_CANONICALIZATION_RANGES;
extern const UChar32* const ucs2CharacterSetInfo[];
extern const CanonicalizationRange ucs2RangeInfo[];
extern const size_t UNICODE_CANONICALIZATION_RANGES;
extern const UChar32* const unicodeCharacterSetInfo[];
extern const CanonicalizationRange unicodeRangeInfo[];
// Selects which pair of tables (UCS2 or Unicode) a lookup consults.
enum class CanonicalMode {
    UCS2,
    Unicode
};
// Returns entry `index` of the character-set table that belongs to
// `canonicalMode`. No bounds checking is performed on `index`.
inline const UChar32* canonicalCharacterSetInfo(unsigned index, CanonicalMode canonicalMode)
{
    if (canonicalMode == CanonicalMode::UCS2)
        return ucs2CharacterSetInfo[index];
    return unicodeCharacterSetInfo[index];
}
// Binary-searches the range table selected by `canonicalMode` and returns the
// entry whose [begin, end] interval contains `ch`.
//
// Precondition: the table must contain an entry covering `ch`. The loop has no
// "not found" exit, so a code point outside every range is not handled here.
inline const CanonicalizationRange* canonicalRangeInfoFor(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2)
{
    const bool isUCS2 = canonicalMode == CanonicalMode::UCS2;
    const CanonicalizationRange* base = isUCS2 ? ucs2RangeInfo : unicodeRangeInfo;
    size_t remaining = isUCS2 ? UCS2_CANONICALIZATION_RANGES : UNICODE_CANONICALIZATION_RANGES;

    for (;;) {
        const size_t middle = remaining >> 1;
        const CanonicalizationRange* probe = base + middle;

        // Target lies before the probe: keep only the lower half.
        if (ch < probe->begin) {
            remaining = middle;
            continue;
        }

        // Target lies inside the probe: done.
        if (ch <= probe->end)
            return probe;

        // Target lies after the probe: skip past it and keep the upper half.
        base = probe + 1;
        remaining -= (middle + 1);
    }
}
// Returns the single partner of `ch` under the rule stored in `info`.
//
// `ch` is expected to lie inside [info->begin, info->end] (checked by ASSERT).
// Only the four pairing kinds are supported; any other type (including
// CanonicalizeUnique and CanonicalizeSet) hits RELEASE_ASSERT_NOT_REACHED.
inline UChar32 getCanonicalPair(const CanonicalizationRange* info, UChar32 ch)
{
    ASSERT(ch >= info->begin && ch <= info->end);

    const UCS2CanonicalizationType kind = info->type;

    if (kind == CanonicalizeRangeLo)
        return ch + info->value;

    if (kind == CanonicalizeRangeHi)
        return ch - info->value;

    // Flip the low bit: 2k <-> 2k + 1.
    if (kind == CanonicalizeAlternatingAligned)
        return ch ^ 1;

    // Same flip shifted by one: 2k + 1 <-> 2k + 2.
    if (kind == CanonicalizeAlternatingUnaligned)
        return ((ch - 1) ^ 1) + 1;

    RELEASE_ASSERT_NOT_REACHED();
    return 0;
}
// Returns true when the range covering `ch` in the selected table has type
// CanonicalizeUnique.
inline bool isCanonicallyUnique(UChar32 ch, CanonicalMode canonicalMode = CanonicalMode::UCS2)
{
    const CanonicalizationRange* range = canonicalRangeInfoFor(ch, canonicalMode);
    return range->type == CanonicalizeUnique;
}
// Returns true when `b` is canonically equivalent to `a` under the tables for
// `canonicalMode`. The rule is taken from the range that covers `a`.
inline bool areCanonicallyEquivalent(UChar32 a, UChar32 b, CanonicalMode canonicalMode = CanonicalMode::UCS2)
{
    const CanonicalizationRange* range = canonicalRangeInfoFor(a, canonicalMode);

    switch (range->type) {
    case CanonicalizeUnique:
        return a == b;

    case CanonicalizeSet: {
        // Walk the zero-terminated equivalence list looking for `b`.
        const UChar32* cursor = canonicalCharacterSetInfo(range->value, canonicalMode);
        while (*cursor) {
            if (*cursor == b)
                return true;
            ++cursor;
        }
        return false;
    }

    case CanonicalizeRangeLo:
        if (a == b)
            return true;
        return a + range->value == b;

    case CanonicalizeRangeHi:
        if (a == b)
            return true;
        return a - range->value == b;

    case CanonicalizeAlternatingAligned:
        // Equal once the low bit is forced on: same (2k, 2k + 1) pair.
        return (a | 1) == (b | 1);

    case CanonicalizeAlternatingUnaligned:
        // Same comparison shifted by one: same (2k + 1, 2k + 2) pair.
        return ((a - 1) | 1) == ((b - 1) | 1);
    }

    RELEASE_ASSERT_NOT_REACHED();
    return false;
}
} }