#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "rbtok.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
#include "rbbidata.h"
U_NAMESPACE_BEGIN
// When building with GCC 4 or later, request optimization level 3 for the
// code that follows; the matching "reset" pragma appears after tokenize().
// NOTE(review): "optimization_level" looks like an Apple GCC extension; other
// GCC-compatible compilers may ignore it with a warning -- confirm.
#if defined(__GNUC__) && (__GNUC__ >= 4)
#pragma GCC optimization_level 3
#endif
// State number whose row ends a run of the state machine: tokenize() leaves
// its inner loop as soon as the current state equals this value.
static const int16_t STOP_STATE = 0;
// State number (and row index into the forward state table) from which the
// match for every token begins.
static const int16_t START_STATE = 1;
/**
 * Tokenizes the text in fText, starting at its current native index.
 *
 * Every pass of the outer loop determines the end of one token, either by
 * stepping to the next entry of fCachedBreakPositions or by running the
 * forward state table over the upcoming characters. The flags of the token
 * are looked up in fStateFlags for the last accepting state:
 *   - flags == -1        : the token is consumed but not reported;
 *   - flags & 0x40000000 : the token is reported and tokenization stops.
 *
 * @param maxTokens      Number of entries available in outTokenRanges (and in
 *                       outTokenFlags when that is supplied).
 * @param outTokenRanges Receives one {start, length} pair per reported token.
 * @param outTokenFlags  If non-NULL, receives the flags of each reported token.
 * @return The number of tokens written to outTokenRanges.
 */
int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
{
    RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
    RuleBasedTokenRange *outTokenP = outTokenRanges;
    int32_t state;
    int16_t category = 0;
    const RBBIStateTableRow *row;
    const RBBIStateTableRow *const startRow = fStartRow;
    int32_t lastAcceptingState = 0;
    UChar32 c = 0;
    signed long prev;
    signed long result;
    const char *const tableData = fData->fForwardTable->fTableData;
    const uint32_t tableRowLen = fData->fForwardTable->fRowLen;
    UText *text = fText;
#ifdef RBBI_DEBUG
    if (fTrace) {
        RBBIDebugPuts("Handle Next pos char state category");
    }
#endif
    fLastStatusIndexValid = FALSE;
    prev = (signed long)UTEXT_GETNATIVEINDEX(text);
    // Start from an empty match. Together with "prev = result" at the bottom
    // of the loop this keeps result == prev at the top of every pass, so
    // "result == prev" after the state machine means that no accepting state
    // was reached. Previously result was read uninitialized in that case.
    result = prev;
    const UTrie *trie = &fData->fTrie;
    while (outTokenP < outTokenLimit) {
        c = UTEXT_NEXT32(text);
        if (c == U_SENTINEL) {
            // End of text: nothing left to tokenize.
            break;
        }
        state = START_STATE;
        row = startRow;

        // While cached break positions remain, simply step to the next one;
        // once they are used up, reset() and fall back to the state machine.
        UBool fromCache = FALSE;
        if (fCachedBreakPositions != NULL) {
            if (fPositionInCache < fNumCachedBreakPositions - 1) {
                ++fPositionInCache;
                result = fCachedBreakPositions[fPositionInCache];
                fromCache = TRUE;
            }
            else {
                reset();
            }
        }

        if (!fromCache) {
            while (c != U_SENTINEL) {
                // Character category = column of the state table. Code points
                // below 0x100 use the table precomputed by init().
                if (__builtin_expect((c < 0x100), 1)) {
                    category = fLatin1Cat[c];
                }
                else {
                    UTRIE_GET16(trie, c, category);
                }
                // Bit 0x4000 flags a character for checkDictionary(); count it
                // and strip the bit before indexing the state table.
                if (__builtin_expect((category & 0x4000) != 0, 0)) {
                    fDictionaryCharCount++;
                    category &= ~0x4000;
                }
#ifdef RBBI_DEBUG
                if (fTrace) {
                    RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText));
                    if (0x20<=c && c<0x7f) {
                        RBBIDebugPrintf("\"%c\" ", c);
                    } else {
                        RBBIDebugPrintf("%5x ", c);
                    }
                    RBBIDebugPrintf("%3d %3d\n", state, category);
                }
#endif
                // State transition.
                state = row->fNextState[category];
                row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);
                if (row->fAccepting == -1) {
                    // Accepting state: remember the longest match so far.
                    result = (signed long)UTEXT_GETNATIVEINDEX(text);
                    lastAcceptingState = state;
                }
                if (state == STOP_STATE) {
                    // No longer match is possible.
                    break;
                }
                c = UTEXT_NEXT32(text);
            }

            if (result == prev) {
                // No accepting state was reached. Consume exactly one code
                // point so the loop always makes forward progress, and use the
                // stop state's flags instead of stale flags left over from the
                // previous token.
                UTEXT_SETNATIVEINDEX(text, prev);
                (void)UTEXT_NEXT32(text);
                result = (signed long)UTEXT_GETNATIVEINDEX(text);
                lastAcceptingState = STOP_STATE;
            }

            if (fDictionaryCharCount > 0) {
                result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
            }
        }

        // Leave the text positioned at the end of this token.
        UTEXT_SETNATIVEINDEX(text, result);
        RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
        int32_t flags = fStateFlags[lastAcceptingState];
        if (flags != -1) {
            *outTokenP++ = range;
            if (outTokenFlags) {
                *outTokenFlags++ = (unsigned long) flags;
            }
            if (flags & 0x40000000) {
                // This token asks for tokenization to stop after it.
                break;
            }
        }
        prev = result;
    }
    return (int32_t)(outTokenP - outTokenRanges);
}
// Undo the optimization-level pragma that was issued before tokenize(), so
// the remaining code is compiled with the default settings.
#if defined (__GNUC__) && (__GNUC__ >= 4)
#pragma GCC optimization_level reset
#endif
void
RuleBasedTokenizer::init()
{
const RBBIStateTable *statetable = fData->fForwardTable;
setBreakType(UBRK_WORD);
fStartRow = (const RBBIStateTableRow *)
(statetable->fTableData + (statetable->fRowLen * START_STATE));
UChar i;
const UTrie *trie = &fData->fTrie;
int16_t category;
fLatin1Cat = new int16_t[256];
for (i = 0; i < 256; ++i)
{
fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
}
fStateFlags = new int32_t[statetable->fNumStates];
for (i = 0; i < statetable->fNumStates; ++i)
{
const RBBIStateTableRow *row = (const RBBIStateTableRow *)
(statetable->fTableData + (statetable->fRowLen * i));
int32_t flags = 0;
if (row->fAccepting == -1)
{
const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
const int32_t *valLimit = vals + 1;
valLimit += *vals++;
while (vals < valLimit)
{
int32_t val = *vals++;
if (val == 0)
{
break;
}
else if (val > 0)
{
flags |= val;
}
else
{
flags = val;
break;
}
}
}
fStateFlags[i] = flags;
}
}
/**
 * Constructs a tokenizer from rule source text.
 *
 * @param rules    The rules, passed on to RuleBasedBreakIterator.
 * @param parseErr Passed on to RuleBasedBreakIterator.
 * @param err      In/out error code. The tokenizer tables are only built when
 *                 it indicates success after the base class was constructed.
 */
RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
    : RuleBasedBreakIterator(rules, parseErr, err)
{
    // Keep the destructor safe when init() is skipped.
    fStartRow = NULL;
    fLatin1Cat = NULL;
    fStateFlags = NULL;
    // init() dereferences fData. NOTE(review): presumably the base class does
    // not provide usable rule data when it reports a failure -- confirm.
    if (U_SUCCESS(err)) {
        init();
    }
}
/**
 * Constructs a tokenizer from binary rule data, which is handed to the
 * RuleBasedBreakIterator constructor as an RBBIDataHeader.
 *
 * @param data   The binary rule data.
 * @param status In/out error code. The tokenizer tables are only built when it
 *               indicates success after the base class was constructed.
 */
RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
    : RuleBasedBreakIterator((RBBIDataHeader *)data, status)
{
    // Keep the destructor safe when init() is skipped.
    fStartRow = NULL;
    fLatin1Cat = NULL;
    fStateFlags = NULL;
    // init() dereferences fData. NOTE(review): presumably the base class does
    // not provide usable rule data when it reports a failure -- confirm.
    if (U_SUCCESS(status)) {
        init();
    }
}
/**
 * Constructs a tokenizer from binary rule data, selecting the
 * RuleBasedBreakIterator constructor that takes kDontAdopt.
 *
 * @param data   The binary rule data, handed on as an RBBIDataHeader.
 * @param status In/out error code. The tokenizer tables are only built when it
 *               indicates success after the base class was constructed.
 */
RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status)
    : RuleBasedBreakIterator((const RBBIDataHeader *)data, RuleBasedBreakIterator::kDontAdopt, status)
{
    // Keep the destructor safe when init() is skipped.
    fStartRow = NULL;
    fLatin1Cat = NULL;
    fStateFlags = NULL;
    // init() dereferences fData. NOTE(review): presumably the base class does
    // not provide usable rule data when it reports a failure -- confirm.
    if (U_SUCCESS(status)) {
        init();
    }
}
/**
 * Releases the two arrays allocated by init().
 */
RuleBasedTokenizer::~RuleBasedTokenizer()
{
    // Category table for code points below 0x100.
    delete[] fLatin1Cat;
    // Per-state token flags.
    delete[] fStateFlags;
}
U_NAMESPACE_END
#endif