TextBreakIterator.cpp [plain text]
#include "config.h"
#include "TextBreakIterator.h"
#include "LineBreakIteratorPoolICU.h"
#include "UTextProviderLatin1.h"
#include "UTextProviderUTF16.h"
#include <mutex>
#include <wtf/Atomics.h>
#include <wtf/text/StringView.h>
namespace WebCore {
// Opens an ICU break iterator of the given |type| for |locale| (which defaults
// to currentTextBreakLocaleID()). No text is attached at this point: the text
// pointer passed to ICU is null and its length is zero.
// Returns whatever ubrk_open() produced, reinterpreted as a TextBreakIterator*;
// a failing ICU status only triggers an assertion.
static TextBreakIterator* initializeIterator(UBreakIteratorType type, const char* locale = currentTextBreakLocaleID())
{
    UErrorCode status = U_ZERO_ERROR;
    auto* opened = ubrk_open(type, locale, nullptr, 0, &status);
    ASSERT_WITH_MESSAGE(U_SUCCESS(status), "ICU could not open a break iterator: %s (%d)", u_errorName(status), status);
    return reinterpret_cast<TextBreakIterator*>(opened);
}
#if !PLATFORM(IOS)
// Builds an ICU rule-based break iterator from |breakRules|, a NUL-terminated
// 8-bit rule string. The rules are viewed as LChar data and upconverted to
// 16-bit characters before being handed to ubrk_openRules(). No text is
// attached yet (null text pointer, zero length).
// Returns whatever ubrk_openRules() produced; a failing ICU status only
// triggers an assertion.
static TextBreakIterator* initializeIteratorWithRules(const char* breakRules)
{
    const unsigned ruleLength = strlen(breakRules);
    auto rules16 = StringView(reinterpret_cast<const LChar*>(breakRules), ruleLength).upconvertedCharacters();

    UParseError parseError;
    UErrorCode status = U_ZERO_ERROR;
    auto* opened = ubrk_openRules(rules16, ruleLength, nullptr, 0, &parseError, &status);
    ASSERT_WITH_MESSAGE(U_SUCCESS(status), "ICU could not open a break iterator: %s (%d)", u_errorName(status), status);
    return reinterpret_cast<TextBreakIterator*>(opened);
}
#endif
// Points |iterator| at |string|.
//
// 8-bit strings are exposed to ICU through a Latin-1 UText provider that lives
// in a stack-allocated UTextWithBuffer; 16-bit strings are passed to ICU
// directly with ubrk_setText().
//
// Returns &iterator on success and nullptr (after logging) if ICU reports a
// failure at any step.
static TextBreakIterator* setTextForIterator(TextBreakIterator& iterator, StringView string)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(&iterator);

    if (!string.is8Bit()) {
        UErrorCode setTextStatus = U_ZERO_ERROR;
        ubrk_setText(icuIterator, string.characters16(), string.length(), &setTextStatus);
        if (U_FAILURE(setTextStatus)) {
            LOG_ERROR("ubrk_setText failed with status %d", setTextStatus);
            return nullptr;
        }
        return &iterator;
    }

    UTextWithBuffer textLocal;
    textLocal.text = UTEXT_INITIALIZER;
    textLocal.text.extraSize = sizeof(textLocal.buffer);
    textLocal.text.pExtra = textLocal.buffer;

    UErrorCode openStatus = U_ZERO_ERROR;
    UText* text = openLatin1UTextProvider(&textLocal, string.characters8(), string.length(), &openStatus);
    if (U_FAILURE(openStatus)) {
        LOG_ERROR("uTextOpenLatin1 failed with status %d", openStatus);
        return nullptr;
    }

    UErrorCode setTextStatus = U_ZERO_ERROR;
    ubrk_setUText(icuIterator, text, &setTextStatus);
    // The local UText is no longer needed once ubrk_setUText() has returned,
    // so close it on both the success and the failure path.
    utext_close(text);
    if (U_FAILURE(setTextStatus)) {
        LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
        return nullptr;
    }

    return &iterator;
}
// Points |iterator| at |string|, additionally making |priorContextLength|
// characters of |priorContext| available to ICU through a context-aware UText
// provider (Latin-1 flavour for 8-bit strings, UTF-16 flavour otherwise).
//
// Returns &iterator on success and nullptr (after logging) if opening the
// provider or attaching it to the iterator fails.
static TextBreakIterator* setContextAwareTextForIterator(TextBreakIterator& iterator, StringView string, const UChar* priorContext, unsigned priorContextLength)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(&iterator);

    if (string.is8Bit()) {
        UTextWithBuffer textLocal;
        textLocal.text = UTEXT_INITIALIZER;
        textLocal.text.extraSize = sizeof(textLocal.buffer);
        textLocal.text.pExtra = textLocal.buffer;

        UErrorCode openStatus = U_ZERO_ERROR;
        UText* text = openLatin1ContextAwareUTextProvider(&textLocal, string.characters8(), string.length(), priorContext, priorContextLength, &openStatus);
        if (U_FAILURE(openStatus)) {
            LOG_ERROR("openLatin1ContextAwareUTextProvider failed with status %d", openStatus);
            return nullptr;
        }

        UErrorCode setTextStatus = U_ZERO_ERROR;
        ubrk_setUText(icuIterator, text, &setTextStatus);
        // Close the local UText whether or not attaching it succeeded.
        utext_close(text);
        if (U_FAILURE(setTextStatus)) {
            LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
            return nullptr;
        }
    } else {
        UText textLocal = UTEXT_INITIALIZER;

        UErrorCode openStatus = U_ZERO_ERROR;
        UText* text = openUTF16ContextAwareUTextProvider(&textLocal, string.characters16(), string.length(), priorContext, priorContextLength, &openStatus);
        if (U_FAILURE(openStatus)) {
            LOG_ERROR("openUTF16ContextAwareUTextProvider failed with status %d", openStatus);
            return nullptr;
        }

        UErrorCode setTextStatus = U_ZERO_ERROR;
        ubrk_setUText(icuIterator, text, &setTextStatus);
        // Close the local UText whether or not attaching it succeeded.
        utext_close(text);
        if (U_FAILURE(setTextStatus)) {
            LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
            return nullptr;
        }
    }

    return &iterator;
}
// Returns the shared word break iterator, re-targeted at |string|.
// The iterator is created on first use and reused by every later call, so a
// subsequent call changes the text seen through a previously returned pointer.
// Returns nullptr if the iterator could not be created or the text could not
// be set.
TextBreakIterator* wordBreakIterator(StringView string)
{
    static TextBreakIterator* const sharedIterator = initializeIterator(UBRK_WORD);
    return sharedIterator ? setTextForIterator(*sharedIterator, string) : nullptr;
}
// Returns the shared sentence break iterator, re-targeted at |string|.
// The iterator is created on first use and reused by every later call, so a
// subsequent call changes the text seen through a previously returned pointer.
// Returns nullptr if the iterator could not be created or the text could not
// be set.
TextBreakIterator* sentenceBreakIterator(StringView string)
{
    static TextBreakIterator* const sharedIterator = initializeIterator(UBRK_SENTENCE);
    return sharedIterator ? setTextForIterator(*sharedIterator, string) : nullptr;
}
// Returns the shared iterator used for cursor movement, re-targeted at |string|.
//
// On non-iOS platforms the iterator is built once from the custom ICU rule
// string below; on iOS it is ICU's character break iterator opened with the
// "th" locale. In both cases the iterator is a function-local static, so every
// call hands back the same object with its text replaced.
//
// Returns nullptr if the iterator could not be created or the text could not
// be set.
TextBreakIterator* cursorMovementIterator(StringView string)
{
#if !PLATFORM(IOS)
// The adjacent string literals below are concatenated by the compiler into a
// single rule source passed to initializeIteratorWithRules().
static const char* kRules =
// Character classes taken from the Grapheme_Cluster_Break property.
"$CR = [\\p{Grapheme_Cluster_Break = CR}];"
"$LF = [\\p{Grapheme_Cluster_Break = LF}];"
"$Control = [\\p{Grapheme_Cluster_Break = Control}];"
// $Extend additionally includes U+FF9E/U+FF9F and excludes five Thai/Lao code points.
"$VoiceMarks = [\\uFF9E\\uFF9F];" "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
"$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
"$L = [\\p{Grapheme_Cluster_Break = L}];"
"$V = [\\p{Grapheme_Cluster_Break = V}];"
"$T = [\\p{Grapheme_Cluster_Break = T}];"
"$LV = [\\p{Grapheme_Cluster_Break = LV}];"
"$LVT = [\\p{Grapheme_Cluster_Break = LVT}];"
// Per-script triples (<name>0, <name>V, <name>1) given as explicit code point
// ranges -- presumably Devanagari, Bengali, Gurmukhi, Gujarati, Oriya, Telugu,
// Kannada and Malayalam, with <name>V being the script's virama; TODO confirm.
// $RI covers U+1F1E6..U+1F1FF. "!!chain;" is an ICU rule option.
"$Hin0 = [\\u0905-\\u0939];" "$HinV = \\u094D;" "$Hin1 = [\\u0915-\\u0939];" "$Ben0 = [\\u0985-\\u09B9];" "$BenV = \\u09CD;" "$Ben1 = [\\u0995-\\u09B9];" "$Pan0 = [\\u0A05-\\u0A39];" "$PanV = \\u0A4D;" "$Pan1 = [\\u0A15-\\u0A39];" "$Guj0 = [\\u0A85-\\u0AB9];" "$GujV = \\u0ACD;" "$Guj1 = [\\u0A95-\\u0AB9];" "$Ori0 = [\\u0B05-\\u0B39];" "$OriV = \\u0B4D;" "$Ori1 = [\\u0B15-\\u0B39];" "$Tel0 = [\\u0C05-\\u0C39];" "$TelV = \\u0C4D;" "$Tel1 = [\\u0C14-\\u0C39];" "$Kan0 = [\\u0C85-\\u0CB9];" "$KanV = \\u0CCD;" "$Kan1 = [\\u0C95-\\u0CB9];" "$Mal0 = [\\u0D05-\\u0D39];" "$MalV = \\u0D4D;" "$Mal1 = [\\u0D15-\\u0D39];" "$RI = [\\U0001F1E6-\\U0001F1FF];" "!!chain;"
// Forward rules: sequences listed here are kept together (no break inside).
"!!forward;"
"$CR $LF;"
"$L ($L | $V | $LV | $LVT);"
"($LV | $V) ($V | $T);"
"($LVT | $T) $T;"
"[^$Control $CR $LF] $Extend;"
"[^$Control $CR $LF] $SpacingMark;"
"$RI $RI / $RI;"
"$RI $RI;"
// Forward rules for the per-script triples, followed by the start of the
// reverse section.
"$Hin0 $HinV $Hin1;" "$Ben0 $BenV $Ben1;" "$Pan0 $PanV $Pan1;" "$Guj0 $GujV $Guj1;" "$Ori0 $OriV $Ori1;" "$Tel0 $TelV $Tel1;" "$Kan0 $KanV $Kan1;" "$Mal0 $MalV $Mal1;" "!!reverse;"
// Reverse rules: the forward sequences with their operands in opposite order.
"$LF $CR;"
"($L | $V | $LV | $LVT) $L;"
"($V | $T) ($LV | $V);"
"$T ($LVT | $T);"
"$Extend [^$Control $CR $LF];"
"$SpacingMark [^$Control $CR $LF];"
"$RI $RI / $RI $RI;"
"$RI $RI;"
// Reverse rules for the per-script triples; the safe_reverse and safe_forward
// sections that follow are left empty.
"$Hin1 $HinV $Hin0;" "$Ben1 $BenV $Ben0;" "$Pan1 $PanV $Pan0;" "$Guj1 $GujV $Guj0;" "$Ori1 $OriV $Ori0;" "$Tel1 $TelV $Tel0;" "$Kan1 $KanV $Kan0;" "$Mal1 $MalV $Mal0;" "!!safe_reverse;"
"!!safe_forward;";
// Built once, on the first call, from the rules above.
static TextBreakIterator* staticCursorMovementIterator = initializeIteratorWithRules(kRules);
#else // PLATFORM(IOS)
// iOS: use ICU's character break iterator opened with the "th" locale instead
// of the custom rules.
static TextBreakIterator* staticCursorMovementIterator = initializeIterator(UBRK_CHARACTER, "th");
#endif // !PLATFORM(IOS)
// Creation may have failed; report that to the caller as nullptr.
if (!staticCursorMovementIterator)
return nullptr;
return setTextForIterator(*staticCursorMovementIterator, string);
}
// Takes a line break iterator for |locale| from the shared pool and points it
// at |string|, with |priorContextLength| characters of |priorContext| made
// available as preceding context.
//
// On success the caller owns the returned iterator until it hands it back via
// releaseLineBreakIterator(). Returns nullptr if the pool could not supply an
// iterator or if the text could not be set; in the latter case the iterator is
// put back into the pool here, because a caller that only sees nullptr has
// nothing to release.
TextBreakIterator* acquireLineBreakIterator(StringView string, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
{
    TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(LineBreakIteratorPool::sharedPool().take(locale));
    if (!iterator)
        return nullptr;

    TextBreakIterator* configuredIterator = setContextAwareTextForIterator(*iterator, string, priorContext, priorContextLength);
    if (!configuredIterator) {
        // Do not strand the iterator outside the pool when setup fails.
        LineBreakIteratorPool::sharedPool().put(reinterpret_cast<UBreakIterator*>(iterator));
        return nullptr;
    }

    return configuredIterator;
}
// Hands |iterator| back to the shared line break iterator pool.
// |iterator| is asserted to be non-null.
void releaseLineBreakIterator(TextBreakIterator* iterator)
{
    ASSERT_ARG(iterator, iterator);

    auto* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    LineBreakIteratorPool::sharedPool().put(icuIterator);
}
// Single-slot cache holding at most one idle character break iterator for
// reuse by NonSharedCharacterBreakIterator. Null when the slot is empty.
static TextBreakIterator* nonSharedCharacterBreakIterator;

// Replaces the cached iterator with |newValue| if, and only if, the cache
// currently holds |expected|. Returns true when the swap happened.
//
// With ENABLE(COMPARE_AND_SWAP) this uses WTF::weakCompareAndSwap
// (NOTE(review): presumably allowed to fail spuriously -- confirm); otherwise
// the compare and the store are performed under a function-local mutex.
// Both callers in this file handle a false return by creating or closing an
// iterator themselves.
static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
{
#if ENABLE(COMPARE_AND_SWAP)
    return WTF::weakCompareAndSwap(reinterpret_cast<void**>(&nonSharedCharacterBreakIterator), expected, newValue);
#else
    DEPRECATED_DEFINE_STATIC_LOCAL(std::mutex, nonSharedCharacterBreakIteratorMutex, ());
    std::lock_guard<std::mutex> locker(nonSharedCharacterBreakIteratorMutex);

    const bool holdsExpected = nonSharedCharacterBreakIterator == expected;
    if (holdsExpected)
        nonSharedCharacterBreakIterator = newValue;
    return holdsExpected;
#endif
}
// Obtains a character break iterator that is not shared with any other user
// and points it at |string|.
//
// The single cached iterator is claimed if one is available; otherwise a new
// ICU character iterator is opened. If no iterator can be obtained, or the
// text cannot be set, m_iterator is left null. In the latter case the
// underlying ICU iterator is closed here, since m_iterator no longer refers to
// it and the destructor would therefore be unable to release it.
NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(StringView string)
{
    TextBreakIterator* iterator = nonSharedCharacterBreakIterator;
    // Claim the cached iterator by swapping the cache slot to null; this fails
    // if the slot was empty or another user claimed it first.
    bool claimedCachedIterator = iterator && compareAndSwapNonSharedCharacterBreakIterator(iterator, nullptr);
    if (!claimedCachedIterator)
        iterator = initializeIterator(UBRK_CHARACTER);

    if (!iterator) {
        m_iterator = nullptr;
        return;
    }

    m_iterator = setTextForIterator(*iterator, string);
    if (!m_iterator)
        ubrk_close(reinterpret_cast<UBreakIterator*>(iterator));
}
// Returns the iterator to the single-slot cache if the slot is currently empty;
// otherwise closes it with ICU.
NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
{
    const bool storedInCache = compareAndSwapNonSharedCharacterBreakIterator(nullptr, m_iterator);
    if (storedInCache)
        return;

    ubrk_close(reinterpret_cast<UBreakIterator*>(m_iterator));
}
// Forwards to ICU's ubrk_first() for |iterator| and returns its result.
int textBreakFirst(TextBreakIterator* iterator)
{
    auto* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_first(icuIterator);
}
// Forwards to ICU's ubrk_last() for |iterator| and returns its result.
int textBreakLast(TextBreakIterator* iterator)
{
    auto* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_last(icuIterator);
}
// Forwards to ICU's ubrk_next() for |iterator| and returns its result.
int textBreakNext(TextBreakIterator* iterator)
{
    auto* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_next(icuIterator);
}
// Forwards to ICU's ubrk_previous() for |iterator| and returns its result.
int textBreakPrevious(TextBreakIterator* iterator)
{
    auto* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_previous(icuIterator);
}
// Forwards to ICU's ubrk_preceding() for |iterator| with offset |pos| and
// returns its result.
int textBreakPreceding(TextBreakIterator* iterator, int pos)
{
    auto* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_preceding(icuIterator, pos);
}
// Forwards to ICU's ubrk_following() for |iterator| with offset |pos| and
// returns its result.
int textBreakFollowing(TextBreakIterator* iterator, int pos)
{
    auto* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_following(icuIterator, pos);
}
// Forwards to ICU's ubrk_current() for |iterator| and returns its result.
int textBreakCurrent(TextBreakIterator* iterator)
{
    auto* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_current(icuIterator);
}
// Forwards to ICU's ubrk_isBoundary() for |iterator| at offset |position| and
// returns its result converted to bool.
bool isTextBreak(TextBreakIterator* iterator, int position)
{
    auto* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    const bool atBoundary = ubrk_isBoundary(icuIterator, position);
    return atBoundary;
}
// Returns true when the rule status ICU reports for |iterator| is anything
// other than UBRK_WORD_NONE.
bool isWordTextBreak(TextBreakIterator* iterator)
{
    auto* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_getRuleStatus(icuIterator) != UBRK_WORD_NONE;
}
// Counts the grapheme clusters in |s| by stepping a character break iterator
// from the start until it reports TextBreakDone.
//
// Shortcuts that return the string length directly:
//  - empty strings (length 0);
//  - 8-bit strings that contain no '\r';
//  - any string for which no break iterator could be obtained.
unsigned numGraphemeClusters(const String& s)
{
    const unsigned stringLength = s.length();
    if (!stringLength)
        return 0;

    // Only 16-bit strings, or 8-bit strings containing '\r', need the iterator.
    const bool needsIterator = !s.is8Bit() || s.contains('\r');
    if (!needsIterator)
        return stringLength;

    NonSharedCharacterBreakIterator it(s);
    if (!it)
        return stringLength;

    unsigned clusterCount = 0;
    for (int boundary = textBreakNext(it); boundary != TextBreakDone; boundary = textBreakNext(it))
        ++clusterCount;
    return clusterCount;
}
// Returns the break offset reached after advancing over the first
// |numGraphemeClusters| grapheme clusters of |s|.
//
// Returns 0 for an empty string, and the full string length if the iterator
// runs out of boundaries before |numGraphemeClusters| steps. For 8-bit strings
// without '\r', or when no break iterator could be obtained, the result is
// min(length, numGraphemeClusters).
unsigned numCharactersInGraphemeClusters(const String& s, unsigned numGraphemeClusters)
{
    const unsigned stringLength = s.length();
    if (!stringLength)
        return 0;

    // Only 16-bit strings, or 8-bit strings containing '\r', need the iterator.
    const bool needsIterator = !s.is8Bit() || s.contains('\r');
    if (!needsIterator)
        return std::min(stringLength, numGraphemeClusters);

    NonSharedCharacterBreakIterator it(s);
    if (!it)
        return std::min(stringLength, numGraphemeClusters);

    unsigned clustersLeft = numGraphemeClusters;
    while (clustersLeft) {
        if (textBreakNext(it) == TextBreakDone)
            return stringLength;
        --clustersLeft;
    }
    return textBreakCurrent(it);
}
}