// TextBreakIteratorICU.h [plain text]
#pragma once
#include <unicode/ubrk.h>
#include <wtf/Optional.h>
#include <wtf/text/icu/UTextProviderLatin1.h>
// True only for PLATFORM(MAC) builds whose __MAC_OS_X_VERSION_MIN_REQUIRED is below 101200.
// Gates caretRules() and TextBreakIteratorICU::Mode::Caret further down in this header.
#define USE_ICU_CARET_ITERATOR (PLATFORM(MAC) && __MAC_OS_X_VERSION_MIN_REQUIRED < 101200)
namespace WTF {
#if USE_ICU_CARET_ITERATOR
// Returns the break-iterator rule source used for caret positioning.
// Only compiled when USE_ICU_CARET_ITERATOR is set; TextBreakIteratorICU passes the
// result to ubrk_openRules() when constructed with Mode::Caret.
// The text is held in a function-local static StaticStringImpl; the adjacent string
// literals below are concatenated by the compiler into one rule string, so their
// exact contents and order are significant.
static String caretRules()
{
static StaticStringImpl caretRuleString(
// Variable definitions: character classes built from the Grapheme_Cluster_Break property.
"$CR = [\\p{Grapheme_Cluster_Break = CR}];"
"$LF = [\\p{Grapheme_Cluster_Break = LF}];"
"$Control = [\\p{Grapheme_Cluster_Break = Control}];"
// $Extend additionally includes $VoiceMarks (U+FF9E, U+FF9F) and excludes five listed code points.
"$VoiceMarks = [\\uFF9E\\uFF9F];" "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
"$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
"$L = [\\p{Grapheme_Cluster_Break = L}];"
"$V = [\\p{Grapheme_Cluster_Break = V}];"
"$T = [\\p{Grapheme_Cluster_Break = T}];"
"$LV = [\\p{Grapheme_Cluster_Break = LV}];"
"$LVT = [\\p{Grapheme_Cluster_Break = LVT}];"
// Per-script code point ranges ($Hin*, $Ben*, $Pan*, $Guj*, $Ori*, $Tel*, $Kan*, $Mal*), each with a
// single joining code point ($HinV etc. -- presumably the script's virama; TODO confirm), followed by
// regional indicators ($RI), $ZWJ, the emoji variation selector, and the emoji sets used by the
// ZWJ-sequence and modifier rules. The last literal on this line is the "!!chain;" option.
"$Hin0 = [\\u0905-\\u0939];" "$HinV = \\u094D;" "$Hin1 = [\\u0915-\\u0939];" "$Ben0 = [\\u0985-\\u09B9];" "$BenV = \\u09CD;" "$Ben1 = [\\u0995-\\u09B9];" "$Pan0 = [\\u0A05-\\u0A39];" "$PanV = \\u0A4D;" "$Pan1 = [\\u0A15-\\u0A39];" "$Guj0 = [\\u0A85-\\u0AB9];" "$GujV = \\u0ACD;" "$Guj1 = [\\u0A95-\\u0AB9];" "$Ori0 = [\\u0B05-\\u0B39];" "$OriV = \\u0B4D;" "$Ori1 = [\\u0B15-\\u0B39];" "$Tel0 = [\\u0C05-\\u0C39];" "$TelV = \\u0C4D;" "$Tel1 = [\\u0C14-\\u0C39];" "$Kan0 = [\\u0C85-\\u0CB9];" "$KanV = \\u0CCD;" "$Kan1 = [\\u0C95-\\u0CB9];" "$Mal0 = [\\u0D05-\\u0D39];" "$MalV = \\u0D4D;" "$Mal1 = [\\u0D15-\\u0D39];" "$RI = [\\U0001F1E6-\\U0001F1FF];" "$ZWJ = \\u200D;" "$EmojiVar = [\\uFE0F];" "$EmojiForSeqs = [\\u2640 \\u2642 \\u26F9 \\u2764 \\U0001F308 \\U0001F3C3-\\U0001F3C4 \\U0001F3CA-\\U0001F3CC \\U0001F3F3 \\U0001F441 \\U0001F466-\\U0001F469 \\U0001F46E-\\U0001F46F \\U0001F471 \\U0001F473 \\U0001F477 \\U0001F481-\\U0001F482 \\U0001F486-\\U0001F487 \\U0001F48B \\U0001F575 \\U0001F5E8 \\U0001F645-\\U0001F647 \\U0001F64B \\U0001F64D-\\U0001F64E \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\u2695-\\u2696 \\u2708 \\U0001F33E \\U0001F373 \\U0001F393 \\U0001F3A4 \\U0001F3A8 \\U0001F3EB \\U0001F3ED \\U0001F4BB-\\U0001F4BC \\U0001F527 \\U0001F52C \\U0001F680 \\U0001F692 \\U0001F926 \\U0001F937-\\U0001F939 \\U0001F93C-\\U0001F93E];" "$EmojiForMods = [\\u261D \\u26F9 \\u270A-\\u270D \\U0001F385 \\U0001F3C3-\\U0001F3C4 \\U0001F3CA \\U0001F3CB \\U0001F442-\\U0001F443 \\U0001F446-\\U0001F450 \\U0001F466-\\U0001F478 \\U0001F47C \\U0001F481-\\U0001F483 \\U0001F485-\\U0001F487 \\U0001F4AA \\U0001F575 \\U0001F590 \\U0001F595 \\U0001F596 \\U0001F645-\\U0001F647 \\U0001F64B-\\U0001F64F \\U0001F6A3 \\U0001F6B4-\\U0001F6B6 \\U0001F6C0 \\U0001F918 \\U0001F3C2 \\U0001F3C7 \\U0001F3CC \\U0001F574 \\U0001F57A \\U0001F6CC \\U0001F919-\\U0001F91E \\U0001F926 \\U0001F930 \\U0001F933-\\U0001F939 \\U0001F93C-\\U0001F93E] ;" "$EmojiMods = [\\U0001F3FB-\\U0001F3FF];" "!!chain;"
"!!RINoChain;"
// "!!forward" section. The final line of this group ends with the literal that opens "!!reverse".
"!!forward;"
"$CR $LF;"
"$L ($L | $V | $LV | $LVT);"
"($LV | $V) ($V | $T);"
"($LVT | $T) $T;"
"$RI $RI $Extend* / $RI;"
"$RI $RI $Extend*;"
"[^$Control $CR $LF] $Extend;"
"[^$Control $CR $LF] $SpacingMark;"
"$Hin0 $HinV $Hin1;" "$Ben0 $BenV $Ben1;" "$Pan0 $PanV $Pan1;" "$Guj0 $GujV $Guj1;" "$Ori0 $OriV $Ori1;" "$Tel0 $TelV $Tel1;" "$Kan0 $KanV $Kan1;" "$Mal0 $MalV $Mal1;" "$ZWJ $EmojiForSeqs;" "$EmojiForMods $EmojiVar? $EmojiMods;" "!!reverse;"
// "!!reverse" section: the same sequences as above with their operands written in the opposite
// order. The final line of this group ends with the literal that opens "!!safe_reverse".
"$LF $CR;"
"($L | $V | $LV | $LVT) $L;"
"($V | $T) ($LV | $V);"
"$T ($LVT | $T);"
"$Extend* $RI $RI / $Extend* $RI $RI;"
"$Extend* $RI $RI;"
"$Extend [^$Control $CR $LF];"
"$SpacingMark [^$Control $CR $LF];"
"$Hin1 $HinV $Hin0;" "$Ben1 $BenV $Ben0;" "$Pan1 $PanV $Pan0;" "$Guj1 $GujV $Guj0;" "$Ori1 $OriV $Ori0;" "$Tel1 $TelV $Tel0;" "$Kan1 $KanV $Kan0;" "$Mal1 $MalV $Mal0;" "$EmojiForSeqs $ZWJ;" "$EmojiMods $EmojiVar? $EmojiForMods;" "!!safe_reverse;"
// "!!safe_reverse" section: regional-indicator runs and emoji modifier sequences.
"$RI $RI+;"
"[$EmojiVar $EmojiMods]+ $EmojiForMods;"
// "!!safe_forward" section: the forward-order counterparts of the two rules above.
"!!safe_forward;"
"$RI $RI+;"
"$EmojiForMods [$EmojiVar $EmojiMods]+;"
);
// Converted to a String on return.
return caretRuleString;
}
#endif
// Move-only owner of an ICU UBreakIterator.
//
// The iterator is opened in the constructor and closed with ubrk_close() in the
// destructor. Copying is disabled; moving transfers the handle and leaves the
// source holding a null iterator. A moved-from object must not be used for
// anything other than assignment or destruction, because the query and setText
// members pass m_iterator to ICU without a null check.
class TextBreakIteratorICU {
public:
    // Kind of boundary analysis requested from the constructor.
    enum class Mode {
        Line,
        Character,
#if USE_ICU_CARET_ITERATOR
        Caret,
#endif
    };

    // Points the iterator at 8-bit text by wrapping |buffer| / |length| in a Latin-1
    // UText provider and handing that UText to ubrk_setUText().
    void set8BitText(const LChar* buffer, unsigned length)
    {
        // Stack-allocated UText whose "extra" storage is the inline buffer of UTextWithBuffer.
        UTextWithBuffer textLocal;
        textLocal.text = UTEXT_INITIALIZER;
        textLocal.text.extraSize = sizeof(textLocal.buffer);
        textLocal.text.pExtra = textLocal.buffer;

        UErrorCode status = U_ZERO_ERROR;
        UText* text = openLatin1UTextProvider(&textLocal, buffer, length, &status);
        ASSERT(U_SUCCESS(status));
        ASSERT(text);

        ubrk_setUText(m_iterator, text, &status);
        ASSERT(U_SUCCESS(status));

        // The local UText is closed right away; per the ICU documentation for
        // ubrk_setUText() the break iterator keeps its own shallow clone.
        utext_close(text);
    }

    // Opens an iterator over |string|.
    // For Mode::Line and Mode::Character the iterator is created with ubrk_open() using |locale|.
    // For Mode::Caret (when available) it is created with ubrk_openRules() from caretRules(),
    // and |locale| is not used.
    TextBreakIteratorICU(StringView string, Mode mode, const char *locale)
    {
        UBreakIteratorType type;
        switch (mode) {
        case Mode::Line:
            type = UBRK_LINE;
            break;
        case Mode::Character:
            type = UBRK_CHARACTER;
            break;
#if USE_ICU_CARET_ITERATOR
        case Mode::Caret:
            type = UBRK_CHARACTER;
            break;
#endif
        default:
            ASSERT_NOT_REACHED();
            type = UBRK_CHARACTER;
            break;
        }

        // 8-bit strings cannot be passed as UChar text, so the iterator is opened with
        // no text and the Latin-1 provider is attached afterwards.
        bool requiresSet8BitText = string.is8Bit();

        const UChar *text = requiresSet8BitText ? nullptr : string.characters16();
        int32_t textLength = requiresSet8BitText ? 0 : string.length();

        UErrorCode status = U_ZERO_ERROR;
#if USE_ICU_CARET_ITERATOR
        if (mode == Mode::Caret) {
            // The rule string and its 16-bit form are built once and kept in function-local statics.
            static NeverDestroyed<String> caretRules = WTF::caretRules();
            static NeverDestroyed<StringView::UpconvertedCharacters> upconvertedRules = StringView(caretRules).upconvertedCharacters();
            UParseError parseError;
            m_iterator = ubrk_openRules(upconvertedRules.get(), caretRules.get().length(), text, textLength, &parseError, &status);
        } else
#endif
            m_iterator = ubrk_open(type, locale, text, textLength, &status);
        ASSERT(U_SUCCESS(status));

        if (requiresSet8BitText)
            set8BitText(string.characters8(), string.length());
    }

    TextBreakIteratorICU() = delete;
    TextBreakIteratorICU(const TextBreakIteratorICU&) = delete;

    // Takes over |other|'s iterator; |other| is left with a null iterator.
    TextBreakIteratorICU(TextBreakIteratorICU&& other) noexcept
        : m_iterator(other.m_iterator)
    {
        other.m_iterator = nullptr;
    }

    TextBreakIteratorICU& operator=(const TextBreakIteratorICU&) = delete;

    // Closes the currently owned iterator (if any) and takes over |other|'s.
    TextBreakIteratorICU& operator=(TextBreakIteratorICU&& other) noexcept
    {
        // Self-move must be a no-op: without this guard the live iterator would be
        // closed and this object would be left holding a null iterator.
        if (this == &other)
            return *this;

        if (m_iterator)
            ubrk_close(m_iterator);
        m_iterator = other.m_iterator;
        other.m_iterator = nullptr;
        return *this;
    }

    ~TextBreakIteratorICU()
    {
        if (m_iterator)
            ubrk_close(m_iterator);
    }

    // Replaces the text being iterated. 8-bit strings go through set8BitText();
    // 16-bit strings are passed straight to ubrk_setText().
    void setText(StringView string)
    {
        if (string.is8Bit()) {
            set8BitText(string.characters8(), string.length());
            return;
        }
        UErrorCode status = U_ZERO_ERROR;
        ubrk_setText(m_iterator, string.characters16(), string.length(), &status);
        ASSERT(U_SUCCESS(status));
    }

    // Returns the result of ubrk_preceding() for |location|, or an empty optional
    // when ICU reports UBRK_DONE.
    std::optional<unsigned> preceding(unsigned location) const
    {
        auto result = ubrk_preceding(m_iterator, location);
        if (result == UBRK_DONE)
            return { };
        return result;
    }

    // Returns the result of ubrk_following() for |location|, or an empty optional
    // when ICU reports UBRK_DONE.
    std::optional<unsigned> following(unsigned location) const
    {
        auto result = ubrk_following(m_iterator, location);
        if (result == UBRK_DONE)
            return { };
        return result;
    }

    // Returns whether ubrk_isBoundary() reports a boundary at |location|.
    bool isBoundary(unsigned location) const
    {
        return ubrk_isBoundary(m_iterator, location);
    }

private:
    // Owned ICU handle; null only after this object has been moved from
    // (or if opening the iterator failed).
    UBreakIterator* m_iterator { nullptr };
};
}