#include "unicode/utypes.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/utypes.h"
#include "unicode/brkiter.h"
#include "unicode/rbbi.h"
#include "unicode/uchar.h"
#include "unicode/utf16.h"
#include "unicode/ucnv.h"
#include "unicode/schriter.h"
#include "unicode/uniset.h"
#include "unicode/regex.h" // TODO: make conditional on regexp being built.
#include "unicode/ustring.h"
#include "intltest.h"
#include "rbbitst.h"
#include <string.h>
#include "uvector.h"
#include "uvectr32.h"
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
// Container for one break-iterator test case: the text to be broken, the
// expected break positions and tags (plus the source line that declared each
// of them), and the positions/tags actually produced by the iterator.
class BITestData {
public:
// All data chunks added so far, concatenated.
UnicodeString fDataToBreak;
// End offset in fDataToBreak of each chunk; one entry per addDataChunk() call.
UVector fExpectedBreakPositions;
// Tag supplied with each chunk, parallel to fExpectedBreakPositions.
UVector fExpectedTags;
// Source line number supplied with each chunk, parallel to fExpectedBreakPositions.
UVector fLineNum;
// Results recorded by the individual test functions; emptied by clearResults().
UVector fActualBreakPositions; UVector fActualTags;
// status is forwarded to the constructors of all UVector members.
BITestData(UErrorCode &status);
// Appends data (if non-NULL) to fDataToBreak and records the expected break,
// tag and line number. NOTE: status is taken by value, so errors raised while
// adding are not reported back to the caller.
void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
// Compares expected against actual results and reports mismatches via test->errln().
void checkResults(const char *heading, RBBITest *test);
// Reports a single position mismatch for the given expected/actual indices.
void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
// Discards the recorded actual positions and tags.
void clearResults();
};
// Constructs an empty test-data holder. Every UVector member is constructed
// with the caller's status, so an allocation failure is reported there.
BITestData::BITestData(UErrorCode &status)
    : fExpectedBreakPositions(status),
      fExpectedTags(status),
      fLineNum(status),
      fActualBreakPositions(status),
      fActualTags(status)
{
}
// Shorthand for td.addDataChunk() that passes the call site's __LINE__ as the
// line number. The expansion already ends with ';'.
#define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
if (U_FAILURE(status)) {return;}
if (data != NULL) {
fDataToBreak.append(CharsToUnicodeString(data));
}
fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
fExpectedTags.addElement(tag, status);
fLineNum.addElement(lineNum, status);
};
// Walks the expected and actual break lists in parallel and reports every
// discrepancy through test->errln().
//   heading - text placed at the start of each message.
//   test    - test object used for reporting.
// Positions are compared first; tags are compared only where the positions
// agree. After a position mismatch, the list holding the smaller position is
// advanced so the two lists can line up again.
void BITestData::checkResults(const char *heading, RBBITest *test) {
    int32_t expectedIndex = 0;
    int32_t actualIndex = 0;

    for (;;) {
        // Both lists used up: finished.
        if (expectedIndex >= fExpectedBreakPositions.size() &&
            actualIndex >= fActualBreakPositions.size()) {
            break;
        }

        // Expected list used up: every remaining actual break is a surplus one.
        if (expectedIndex >= fExpectedBreakPositions.size()) {
            err(heading, test, expectedIndex-1, actualIndex);
            actualIndex++;
            continue;
        }

        // Actual list used up: every remaining expected break was missed.
        if (actualIndex >= fActualBreakPositions.size()) {
            err(heading, test, expectedIndex, actualIndex-1);
            expectedIndex++;
            continue;
        }

        if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
            err(heading, test, expectedIndex, actualIndex);
            if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
                actualIndex++;
            } else {
                expectedIndex++;
            }
            continue;
        }

        // Positions agree; verify the tag. The line number is fetched with
        // elementAti() so that an int, not a pointer, is passed for "%d".
        if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
            test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
                heading, fLineNum.elementAti(expectedIndex),
                fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
        }

        actualIndex++;
        expectedIndex++;
    }
}
void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
{
int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
int32_t actual = fActualBreakPositions.elementAti(actualIdx);
int32_t o = 0;
int32_t line = fLineNum.elementAti(expectedIdx);
if (expectedIdx > 0) {
o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
}
if (actual < expected) {
test->errln("%s unexpected break at offset %d in test item from line %d", heading, o, line);
} else {
test->errln("%s Failed to find break at end of item from line %d", heading, line);
}
}
// Empties both "actual" vectors so the object can be reused for another pass.
void BITestData::clearResults() {
    fActualTags.removeAllElements();
    fActualBreakPositions.removeAllElements();
}
// Zero-terminated list of UTF-16 code units used to build cannedTestChars in
// the RBBITest constructor.
static const UChar cannedTestArray[] = {
0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
};
// Allocated by the RBBITest constructor (U+0000 followed by the contents of
// cannedTestArray) and deleted by the RBBITest destructor.
static UnicodeString* cannedTestChars = 0;
// Escaped code-point sequences: a consonant, U+094D and, for the half* forms,
// a trailing U+200D. No use of these macros is visible in this part of the file.
#define halfNA "\\u0928\\u094d\\u200d"
#define halfSA "\\u0938\\u094d\\u200d"
#define halfCHA "\\u091a\\u094d\\u200d"
#define halfKA "\\u0915\\u094d\\u200d"
#define deadTA "\\u0924\\u094d"
// Builds the shared canned character string: a leading U+0000 (which cannot
// be part of the zero-terminated array itself) followed by cannedTestArray.
RBBITest::RBBITest() {
    cannedTestChars = new UnicodeString();
    cannedTestChars->append((UChar)0x0000);
    cannedTestChars->append(UnicodeString(cannedTestArray));
}
// Releases the shared canned character string. The file-static pointer is
// reset afterwards so that destroying another RBBITest instance does not
// delete the same string a second time.
RBBITest::~RBBITest() {
    delete cannedTestChars;
    cannedTestChars = NULL;
}
// Tag constants. NOTE(review): presumably rule-status values for word breaks
// (TestJapaneseWordBreak uses the literals 400 and 300) - confirm; the names
// themselves are not referenced in the visible part of this file.
static const int T_NUMBER = 100;
static const int T_LETTER = 200;
static const int T_H_OR_K = 300;
static const int T_IDEO = 400;
// More escaped code-point sequences (consonant + U+094D, and U+0903 alone).
// No use of these macros is visible in this part of the file.
#define deadRA "\\u0930\\u094d"
#define deadPHA "\\u092b\\u094d"
#define deadTTHA "\\u0920\\u094d"
#define deadPA "\\u092a\\u094d"
#define deadSA "\\u0938\\u094d"
#define visarga "\\u0903"
void RBBITest::TestStatusReturn() {
UnicodeString rulesString1 = "$Letters = [:L:];\n"
"$Numbers = [:N:];\n"
"$Letters+{1};\n"
"$Numbers+{2};\n"
"Help\\ {4}/me\\!;\n"
"[^$Letters $Numbers];\n"
"!.*;\n";
UnicodeString testString1 = "abc123..abc Help me Help me!";
int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
UErrorCode status=U_ZERO_ERROR;
UParseError parseError;
RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
if(U_FAILURE(status)) {
errln("FAIL : in construction");
} else {
int32_t pos;
int32_t i = 0;
bi->setText(testString1);
for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
if (pos != bounds1[i]) {
errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
break;
}
int tag = bi->getRuleStatus();
if (tag != brkStatus[i]) {
errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
break;
}
i++;
}
}
delete bi;
}
// Debugging aid: prints one row of character properties to stdout for every
// code point of ustr, preceded by a separator line wherever the code-unit
// index appears in expected[].
//   ustr          - text to dump.
//   expected      - code-unit offsets to mark; may be NULL when expectedcount is 0.
//   expectedcount - number of entries in expected.
static void printStringBreaks(UnicodeString ustr, int expected[],
                              int expectedcount)
{
    char name[100];
    printf("code alpha extend alphanum type line name\n");
    int j;
    for (j = 0; j < ustr.length(); j ++) {
        if (expectedcount > 0) {
            int k;
            for (k = 0; k < expectedcount; k ++) {
                if (j == expected[k]) {
                    printf("------------------------------------------------ %d\n",
                           j);
                }
            }
        }
        UChar32 c = ustr.char32At(j);
        if (c > 0xffff) {
            // Supplementary code point: skip its trail surrogate.
            j ++;
        }

        // Use a fresh status and an empty name for every character so that one
        // failed lookup neither suppresses later lookups nor leaves a stale
        // name from an earlier row in the buffer.
        UErrorCode status = U_ZERO_ERROR;
        name[0] = 0;
        u_charName(c, U_UNICODE_CHAR_NAME, name, (int32_t)sizeof(name), &status);

        // Never hand a NULL pointer to "%s".
        const char *typeName = u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
                                                      u_charType(c),
                                                      U_SHORT_PROPERTY_NAME);
        const char *lineName = u_getPropertyValueName(UCHAR_LINE_BREAK,
                                                      u_getIntPropertyValue(c,
                                                                            UCHAR_LINE_BREAK),
                                                      U_SHORT_PROPERTY_NAME);
        if (typeName == NULL) {
            typeName = "";
        }
        if (lineName == NULL) {
            lineName = "";
        }

        printf("%7x %5d %6d %8d %4s %4s %s\n", (int)c,
               u_isUAlphabetic(c),
               u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
               u_isalnum(c),
               typeName,
               lineName,
               name);
    }
}
void RBBITest::TestThaiLineBreak() {
UErrorCode status = U_ZERO_ERROR;
BITestData thaiLineSelection(status);
ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
Locale("th"), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
return;
}
generalIteratorTest(*e, thaiLineSelection);
delete e;
}
void RBBITest::TestMixedThaiLineBreak()
{
UErrorCode status = U_ZERO_ERROR;
BITestData thaiLineSelection(status);
ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""
"\\u0e23\\u0e38\\u0e48\\u0e19", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34.", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e22.", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e35\\u0e49", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e32\\u0e04\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "$200", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e48\\u0e32", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19 ", 0, status);
ADD_DATACHUNK(thaiLineSelection, "(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\").", 0, status);
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
return;
}
generalIteratorTest(*e, thaiLineSelection);
delete e;
}
void RBBITest::TestMaiyamok()
{
UErrorCode status = U_ZERO_ERROR;
BITestData thaiLineSelection(status);
ADD_DATACHUNK(thaiLineSelection, NULL, 0, status); ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07", 0, status);
ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
Locale("th"), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
return;
}
generalIteratorTest(*e, thaiLineSelection);
delete e;
}
void RBBITest::TestThaiWordBreak() {
UErrorCode status = U_ZERO_ERROR;
BITestData thaiWordSelection(status);
ADD_DATACHUNK(thaiWordSelection, NULL, 0, status); ADD_DATACHUNK(thaiWordSelection, "\\u0E1A\\u0E17", 0, status); ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E35\\u0E48", 0, status); ADD_DATACHUNK(thaiWordSelection, "\\u0E51", 0, status); ADD_DATACHUNK(thaiWordSelection, "\\u0E1E\\u0E32\\u0E22\\u0E38", 0, status); ADD_DATACHUNK(thaiWordSelection, "\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19", 0, status); ADD_DATACHUNK(thaiWordSelection, "\\u000D\\u000A", 0, status);
ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E14", 0, status); ADD_DATACHUNK(thaiWordSelection, "\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22", 0, status);
ADD_DATACHUNK(thaiWordSelection, "\\u0E2D\\u0E22\\u0E39\\u0E48", 0, status);
ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07", 0, status);
ADD_DATACHUNK(thaiWordSelection, "\\u0E17\\u0E38\\u0E48\\u0E07", 0, status); ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E2B\\u0E0D\\u0E48", 0, status); ADD_DATACHUNK(thaiWordSelection, "\\u0E43\\u0E19", 0, status);
ADD_DATACHUNK(thaiWordSelection, "\\u0E41\\u0E04\\u0E19", 0, status); ADD_DATACHUNK(thaiWordSelection, "\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A", 0, status);
ADD_DATACHUNK(thaiWordSelection, "\\u0E25\\u0E38\\u0E07", 0, status);
ADD_DATACHUNK(thaiWordSelection, "\\u0E40\\u0E2E", 0, status); ADD_DATACHUNK(thaiWordSelection, "\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30", 0, status);
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
Locale("th"), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n");
return;
}
generalIteratorTest(*e, thaiWordSelection);
delete e;
}
void RBBITest::TestBug3818() {
UErrorCode status = U_ZERO_ERROR;
static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
UnicodeString thaiStr(thaiWordData);
RuleBasedBreakIterator* bi =
(RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
if (U_FAILURE(status) || bi == NULL) {
errln("Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
return;
}
bi->setText(thaiStr);
int32_t startOfSecondWord = bi->following(1);
if (startOfSecondWord != 4) {
errln("Fail at file %s, line %d expected start of word at 4, got %d",
__FILE__, __LINE__, startOfSecondWord);
}
startOfSecondWord = bi->following(0);
if (startOfSecondWord != 4) {
errln("Fail at file %s, line %d expected start of word at 4, got %d",
__FILE__, __LINE__, startOfSecondWord);
}
delete bi;
}
void RBBITest::TestJapaneseWordBreak() {
UErrorCode status = U_ZERO_ERROR;
BITestData japaneseWordSelection(status);
ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status); ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status);
RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
Locale("ja"), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
return;
}
generalIteratorTest(*e, japaneseWordSelection);
delete e;
}
// Test dispatcher: sets name to the test registered under index and, when
// exec is true, runs it. An unknown index yields an empty name.
//   params - passed to TestMonkey only (when regular expressions are built).
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
{
    if (exec) {
        logln("TestSuite RuleBasedBreakIterator: ");
    }

    switch (index) {
    case 0:
        name = "TestBug4153072";
        if (exec) { TestBug4153072(); }
        break;
    case 1:
        name = "TestJapaneseLineBreak";
        if (exec) { TestJapaneseLineBreak(); }
        break;
    case 2:
        name = "TestStatusReturn";
        if (exec) { TestStatusReturn(); }
        break;
    case 3:
        name = "TestLineBreakData";
        if (exec) { TestLineBreakData(); }
        break;
    case 4:
        name = "TestEmptyString";
        if (exec) { TestEmptyString(); }
        break;
    case 5:
        name = "TestGetAvailableLocales";
        if (exec) { TestGetAvailableLocales(); }
        break;
    case 6:
        name = "TestGetDisplayName";
        if (exec) { TestGetDisplayName(); }
        break;
    case 7:
        name = "TestEndBehaviour";
        if (exec) { TestEndBehaviour(); }
        break;
    case 8:
        name = "TestMixedThaiLineBreak";
        if (exec) { TestMixedThaiLineBreak(); }
        break;
    case 9:
        name = "TestThaiWordBreak";
        if (exec) { TestThaiWordBreak(); }
        break;
    case 10:
        name = "TestThaiLineBreak";
        if (exec) { TestThaiLineBreak(); }
        break;
    case 11:
        name = "TestMaiyamok";
        if (exec) { TestMaiyamok(); }
        break;
    case 12:
        name = "TestWordBreaks";
        if (exec) { TestWordBreaks(); }
        break;
    case 13:
        name = "TestWordBoundary";
        if (exec) { TestWordBoundary(); }
        break;
    case 14:
        name = "TestLineBreaks";
        if (exec) { TestLineBreaks(); }
        break;
    case 15:
        name = "TestSentBreaks";
        if (exec) { TestSentBreaks(); }
        break;
    case 16:
        name = "TestExtended";
        if (exec) { TestExtended(); }
        break;
    case 17:
        name = "TestMonkey";
        if (exec) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
            TestMonkey(params);
#else
            logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
#endif
        }
        break;
    case 18:
        name = "TestBug3818";
        if (exec) { TestBug3818(); }
        break;
    case 19:
        name = "TestJapaneseWordBreak";
        if (exec) { TestJapaneseWordBreak(); }
        break;
    default:
        name = "";
        break;
    }
}
// Sets the test text on bi and then runs every traversal check in turn
// (first/next, last/previous, following, preceding, isBoundary and the
// clone/next(n) consistency test) against the expectations stored in td.
void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
{
bi.setText(td.fDataToBreak);
testFirstAndNext(bi, td);
testLastAndPrevious(bi, td);
testFollowing(bi, td);
testPreceding(bi, td);
testIsBoundary(bi, td);
doMultipleSelectionTest(bi, td);
}
// Collects every break produced by first()/next() together with its rule
// status and checks the result against td. Iteration is abandoned as soon as
// a position fails to advance; the offending position is still recorded.
void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
{
    UErrorCode status = U_ZERO_ERROR;

    logln("Test first and next");
    bi.setText(td.fDataToBreak);
    td.clearResults();

    int32_t previousPos = -1;
    int32_t pos = bi.first();
    while (pos != RuleBasedBreakIterator::DONE) {
        td.fActualBreakPositions.addElement(pos, status);
        const int32_t ruleTag = bi.getRuleStatus();
        td.fActualTags.addElement(ruleTag, status);

        if (pos <= previousPos) {
            // No forward progress; stop to avoid looping forever.
            break;
        }
        previousPos = pos;
        pos = bi.next();
    }

    td.checkResults("testFirstAndNext", this);
}
// Collects every break produced by last()/previous() together with its rule
// status and checks the result against td. Results are inserted at the front
// so they end up in ascending order. Iteration is abandoned as soon as a
// position fails to move backwards; the offending position is still recorded.
void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t p;
    int32_t lastP = 0x7ffffffe;
    int32_t tag;

    // Log what this function actually tests (the text used to be a copy of
    // the message in testFirstAndNext).
    logln("Test last and previous");
    bi.setText(td.fDataToBreak);
    td.clearResults();

    for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
        td.fActualBreakPositions.insertElementAt(p, 0, status);
        tag = bi.getRuleStatus();
        td.fActualTags.insertElementAt(tag, 0, status);
        if (p >= lastP) {
            // No backward progress; stop to avoid looping forever.
            break;
        }
        lastP = p;
    }
    td.checkResults("testLastAndPrevious", this);
}
// Calls following(i) for every offset from 0 up to length+1, records each
// newly seen break with its rule status, and checks the result against td.
// The loop must stop with DONE exactly when i equals the text length.
void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
{
    UErrorCode status = U_ZERO_ERROR;

    logln("testFollowing():");
    bi.setText(td.fDataToBreak);
    td.clearResults();

    // following() never reports the first boundary, so seed the results with it.
    int32_t pos = bi.first();
    td.fActualBreakPositions.addElement(pos, status);
    int32_t ruleTag = bi.getRuleStatus();
    td.fActualTags.addElement(ruleTag, status);

    int32_t lastSeen = -2;
    const int32_t limit = td.fDataToBreak.length() + 1;
    int i;
    for (i = 0; i <= limit; i++) {
        pos = bi.following(i);
        if (pos == lastSeen) {
            continue;
        }
        if (pos == RuleBasedBreakIterator::DONE) {
            break;
        }
        td.fActualBreakPositions.addElement(pos, status);
        ruleTag = bi.getRuleStatus();
        td.fActualTags.addElement(ruleTag, status);
        lastSeen = pos;
    }

    if (i != td.fDataToBreak.length()) {
        errln("testFollowing(): iterator returned DONE prematurely.");
    }
    td.checkResults("testFollowing", this);
}
// Calls preceding(i) for every offset from the text length down to -1,
// records each newly seen break (inserted at the front, so the list ends up
// ascending) with its rule status, and checks the result against td.
// The loop must stop with DONE exactly when i equals 0.
void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
    UErrorCode status = U_ZERO_ERROR;

    logln("testPreceding():");
    bi.setText(td.fDataToBreak);
    td.clearResults();

    // preceding() never reports the last boundary, so seed the results with it.
    int32_t pos = bi.last();
    td.fActualBreakPositions.addElement(pos, status);
    int32_t ruleTag = bi.getRuleStatus();
    td.fActualTags.addElement(ruleTag, status);

    int32_t lastSeen = 0x7ffffffe;
    int i;
    for (i = td.fDataToBreak.length(); i >= -1; i--) {
        pos = bi.preceding(i);
        if (pos == lastSeen) {
            continue;
        }
        if (pos == RuleBasedBreakIterator::DONE) {
            break;
        }
        td.fActualBreakPositions.insertElementAt(pos, 0, status);
        lastSeen = pos;
        ruleTag = bi.getRuleStatus();
        td.fActualTags.insertElementAt(ruleTag, 0, status);
    }

    if (i != 0) {
        errln("testPreceding(): iterator returned DONE prematurely.");
    }
    td.checkResults("testPreceding", this);
}
// Probes isBoundary() at every offset from 0 through the text length, records
// each offset reported as a boundary with its rule status, and checks the
// result against td.
void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
    UErrorCode status = U_ZERO_ERROR;

    logln("testIsBoundary():");
    bi.setText(td.fDataToBreak);
    td.clearResults();

    int offset;
    for (offset = 0; offset <= td.fDataToBreak.length(); offset++) {
        if (!bi.isBoundary(offset)) {
            continue;
        }
        td.fActualBreakPositions.addElement(offset, status);
        const int32_t ruleTag = bi.getRuleStatus();
        td.fActualTags.addElement(ruleTag, status);
    }

    td.checkResults("testIsBoundary: ", this);
}
// Checks that next(n) on a clone agrees with n single steps on the original
// iterator, walking forwards from first() and then backwards from last().
// Also checks that the clone compares equal to the original initially and
// unequal once the original has been advanced.
//   iterator - iterator under test; its text is set from td.
//   td       - supplies the text to iterate over.
void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
{
    iterator.setText(td.fDataToBreak);

    RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
    if (testIterator == NULL) {
        // Nothing sensible can be compared without a clone.
        errln("doMultipleSelectionTest: clone() returned NULL");
        return;
    }
    int32_t offset = iterator.first();
    int32_t testOffset;
    int32_t count = 0;

    logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());

    if (*testIterator != iterator)
        errln("clone() or operator!= failed: two clones compared unequal");

    // Forward: next(count) from first() on the clone vs. count next() calls.
    do {
        testOffset = testIterator->first();
        testOffset = testIterator->next(count);
        if (offset != testOffset)
            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);

        if (offset != RuleBasedBreakIterator::DONE) {
            count++;
            offset = iterator.next();

            if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
                errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
                if (count > 10000 || offset == -1) {
                    errln("operator== failed too many times. Stopping test.");
                    if (offset == -1) {
                        errln("Does (RuleBasedBreakIterator::DONE == -1)?");
                    }
                    // Release the clone before bailing out.
                    delete testIterator;
                    return;
                }
            }
        }
    } while (offset != RuleBasedBreakIterator::DONE);

    // Backward: next(count) with count <= 0 from last() on the clone vs.
    // repeated previous() calls on the original.
    offset = iterator.last();
    count = 0;

    do {
        testOffset = testIterator->last();
        testOffset = testIterator->next(count);
        if (offset != testOffset)
            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);

        if (offset != RuleBasedBreakIterator::DONE) {
            count--;
            offset = iterator.previous();
        }
    } while (offset != RuleBasedBreakIterator::DONE);

    delete testIterator;
}
void RBBITest::TestCharacterInvariants()
{
UErrorCode status = U_ZERO_ERROR;
BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n");
return;
}
UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
doBreakInvariantTest(*e, s);
s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
doOtherInvariantTest(*e, s);
delete e;
}
void RBBITest::TestWordInvariants()
{
UErrorCode status = U_ZERO_ERROR;
BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n");
return;
}
UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
doBreakInvariantTest(*e, s);
s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
doOtherInvariantTest(*e, s);
delete e;
}
void RBBITest::TestSentenceInvariants()
{
UErrorCode status = U_ZERO_ERROR;
BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n");
return;
}
UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff");
doOtherInvariantTest(*e, s);
delete e;
}
// Invariant: there is a break after CR, LF, U+2029 or U+2028, whatever
// surrounds it. Every three-character string "c0 c1 c2" is tried, with c1 one
// of those four characters and c0/c2 taken from testChars, and a break at
// offset 2 is required. Two combinations are skipped: CR followed by LF,
// U+2029, U+2028 or U+0003, and a control character followed by a mark.
// Gives up after 75 failures.
void RBBITest::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
{
    UnicodeString work("aaa");
    const UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028");
    const int32_t breaksLen = breaks.length();
    const int32_t testCharsLen = testChars.length();
    int32_t errCount = 0;

    for (int32_t breakIdx = 0; breakIdx < breaksLen; breakIdx++) {
        const UChar c1 = breaks[breakIdx];
        const UBool c1IsCR = (c1 == '\r');
        const UBool c1IsControl = (u_charType(c1) == U_CONTROL_CHAR);
        work.setCharAt(1, c1);

        for (int32_t beforeIdx = 0; beforeIdx < testCharsLen; beforeIdx++) {
            work.setCharAt(0, testChars[beforeIdx]);

            for (int32_t afterIdx = 0; afterIdx < testCharsLen; afterIdx++) {
                const UChar c2 = testChars[afterIdx];
                work.setCharAt(2, c2);

                if (c1IsCR) {
                    if (c2 == '\n' || c2 == 0x2029 || c2 == 0x2028 || c2 == 0x0003) {
                        continue;
                    }
                }

                if (c1IsControl) {
                    const int8_t c2Type = u_charType(c2);
                    if (c2Type == U_NON_SPACING_MARK ||
                        c2Type == U_ENCLOSING_MARK ||
                        c2Type == U_COMBINING_SPACING_MARK) {
                        continue;
                    }
                }

                tb.setText(work);
                UBool breakFound = FALSE;
                int32_t pos = tb.first();
                while (pos != BreakIterator::DONE) {
                    if (pos == 2) {
                        breakFound = TRUE;
                        break;
                    }
                    pos = tb.next();
                }

                if (breakFound) {
                    continue;
                }
                printStringBreaks(work, NULL, 0);
                errln("No Break between \\U%04x and \\U%04x", c1, c2);
                errCount++;
                if (errCount >= 75) {
                    return;
                }
            }
        }
    }
}
// Checks two "no break here" invariants, giving up after 75 failures overall:
//  1. No break between CR and LF: in "x CR LF y", with x and y taken from
//     testChars, offset 2 must never be reported.
//  2. No break before a non-spacing or enclosing mark: in "a c1 c2 a", with
//     c2 such a mark and c1 any test character that is not a line/paragraph
//     separator, U+0003, a control or a format character, offset 2 must
//     never be reported.
void RBBITest::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
{
    const int32_t testCharsLen = testChars.length();
    int32_t errCount = 0;

    // --- Invariant 1: CR LF stays together ---
    UnicodeString work("a\r\na");
    for (int32_t firstIdx = 0; firstIdx < testCharsLen; firstIdx++) {
        work.setCharAt(0, testChars[firstIdx]);
        for (int32_t lastIdx = 0; lastIdx < testCharsLen; lastIdx++) {
            work.setCharAt(3, testChars[lastIdx]);
            tb.setText(work);

            int32_t pos = tb.first();
            while (pos != BreakIterator::DONE) {
                if (pos == 2) {
                    errln("Break between CR and LF in string U\\%04x U\\%04x U\\%04x U\\%04x",
                          work[0], work[1], work[2], work[3]);
                    errCount++;
                    if (errCount >= 75) {
                        return;
                    }
                }
                pos = tb.next();
            }
        }
    }

    // --- Invariant 2: marks stay attached to the preceding character ---
    work = UnicodeString("aaaa");
    for (int32_t baseIdx = 0; baseIdx < testCharsLen; baseIdx++) {
        const UChar c1 = testChars[baseIdx];
        const UBool isSeparator =
            (c1 == '\n' || c1 == '\r' || c1 == 0x2029 || c1 == 0x2028 || c1 == 0x0003);
        if (isSeparator ||
            u_charType(c1) == U_CONTROL_CHAR ||
            u_charType(c1) == U_FORMAT_CHAR) {
            continue;
        }
        work.setCharAt(1, c1);

        for (int32_t markIdx = 0; markIdx < testCharsLen; markIdx++) {
            const UChar c2 = testChars[markIdx];
            const int8_t c2Type = u_charType(c2);
            if (!(c2Type == U_NON_SPACING_MARK || c2Type == U_ENCLOSING_MARK)) {
                continue;
            }
            work.setCharAt(2, c2);
            tb.setText(work);

            int pos = tb.first();
            while (pos != BreakIterator::DONE) {
                if (pos == 2) {
                    errln("Unexpected Break between %6x and %6x", c1, c2);
                    errCount++;
                    if (errCount >= 75) {
                        return;
                    }
                }
                pos = tb.next();
            }
        }
    }
}
// Runs the general iterator checks on the default-locale line iterator over
// an empty text: the only expected break is at offset 0.
void RBBITest::TestEmptyString()
{
    UErrorCode status = U_ZERO_ERROR;
    BITestData x(status);

    // An empty chunk records a single expected break at offset 0 (tag 0).
    ADD_DATACHUNK(x, "", 0, status);

    RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
        return;
    }
    generalIteratorTest(*bi, x);
    delete bi;
}
void RBBITest::TestGetAvailableLocales()
{
int32_t locCount = 0;
const Locale* locList = BreakIterator::getAvailableLocales(locCount);
if (locCount == 0)
errln("getAvailableLocales() returned an empty list!");
int32_t i;
for (i = 0; i < locCount; ++i) {
logln(locList[i].getName());
}
}
void RBBITest::TestGetDisplayName()
{
UnicodeString result;
BreakIterator::getDisplayName(Locale::getUS(), result);
if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
+ result);
BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
if (result != "French (France)")
errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
+ result);
}
void RBBITest::TestEndBehaviour()
{
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString("boo.");
BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
return;
}
wb->setText(testString);
if (wb->first() != 0)
errln("Didn't get break at beginning of string.");
if (wb->next() != 3)
errln("Didn't get break before period in \"boo.\"");
if (wb->current() != 4 && wb->next() != 4)
errln("Didn't get break at end of string.");
delete wb;
}
void RBBITest::TestBug4153072() {
UErrorCode status = U_ZERO_ERROR;
BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
return;
}
UnicodeString str("...Hello, World!...");
int32_t begin = 3;
int32_t end = str.length() - 3;
UBool dummy;
StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
iter->adoptText(textIterator);
int index;
for (index = -1; index < begin + 1; ++index) {
dummy = iter->isBoundary(index);
if (index < begin && dummy == TRUE) {
errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +
" and begin index = " + begin);
}
}
delete iter;
}
// Currently a no-op: the entire body is compiled out with "#if 0".
// The disabled code puts each character of precedingChars / followingChars
// between U+4E00 and U+4E8C and checks where a Japanese line iterator breaks
// (expected breaks 0,1,3 for the preceding set and 0,2,3 for the following set).
void RBBITest::TestJapaneseLineBreak()
{
#if 0
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
UnicodeString precedingChars = CharsToUnicodeString(
"([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
UnicodeString followingChars = CharsToUnicodeString(
")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
"\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);
int32_t i;
if (U_FAILURE(status))
{
errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
return;
}
for (i = 0; i < precedingChars.length(); i++) {
testString.setCharAt(1, precedingChars[i]);
iter->setText(testString);
int32_t j = iter->first();
if (j != 0)
errln("ja line break failure: failed to start at 0");
j = iter->next();
if (j != 1)
errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
+ "' (" + ((int)(precedingChars[i])) + ")");
j = iter->next();
if (j != 3)
errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
+ "' (" + ((int)(precedingChars[i])) + ")");
}
for (i = 0; i < followingChars.length(); i++) {
testString.setCharAt(1, followingChars[i]);
iter->setText(testString);
int j = iter->first();
if (j != 0)
errln("ja line break failure: failed to start at 0");
j = iter->next();
if (j != 2)
errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
+ "' (" + ((int)(followingChars[i])) + ")");
j = iter->next();
if (j != 3)
errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
+ "' (" + ((int)(followingChars[i])) + ")");
}
delete iter;
#endif
}
// State shared between TestExtended() and executeTest() for one data-driven
// test item. The three vectors are indexed by offset into dataToBreak.
struct TestParams {
// Iterator under test; executeTest() casts it to RuleBasedBreakIterator to read the rule status.
BreakIterator *bi;
// Text to iterate over.
UnicodeString dataToBreak;
// Per offset: 0 = no break expected, -1 = break expected with status 0,
// any other value = break expected with that rule status.
UVector32 *expectedBreaks;
// Per offset: line and column reported in error messages as "File line,col".
UVector32 *srcLine;
UVector32 *srcCol;
};
// Runs one data-driven test item: iterates t->bi over t->dataToBreak forwards
// and then backwards, and compares every break and its rule status with
// t->expectedBreaks (0 = no break, -1 = break with status 0, otherwise the
// expected status). Each problem is reported with the line/column recorded
// for that offset in t->srcLine / t->srcCol.
void RBBITest::executeTest(TestParams *t) {
    int32_t bp;
    int32_t prevBP;
    int32_t i;

    t->bi->setText(t->dataToBreak);

    //
    // Forward pass: first() / next().
    //
    prevBP = -1;
    for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
        if (prevBP == bp) {
            // Stuck iterator; bail out rather than loop forever.
            errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
            break;
        }

        // Any expected break strictly between the previous and the current
        // break was skipped by the iterator.
        for (i=prevBP+1; i<bp; i++) {
            if (t->expectedBreaks->elementAti(i) != 0) {
                int expected[] = {0, i};
                printStringBreaks(t->dataToBreak, expected, 2);
                errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
                    i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
            }
        }

        if (t->expectedBreaks->elementAti(bp) == 0) {
            int expected[] = {0, bp};
            printStringBreaks(t->dataToBreak, expected, 2);
            errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
        } else {
            // -1 marks a break whose expected rule status is 0.
            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
            if (expectedTagVal == -1) {
                expectedTagVal = 0;
            }
            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
            if (rs != expectedTagVal) {
                errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
                      " Actual, Expected status = %4d, %4d",
                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
            }
        }

        prevBP = bp;
    }

    // Expected breaks after the last break actually returned.
    for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
        if (t->expectedBreaks->elementAti(i) != 0) {
            errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
                i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
        }
    }

    //
    // Reverse pass: last() / previous().
    //
    prevBP = t->dataToBreak.length()+2;   // start beyond the end of the text
    for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
        if (prevBP == bp) {
            // Stuck iterator; bail out rather than loop forever.
            errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
            break;
        }

        // Any expected break strictly between the current and the previous
        // (higher) break was skipped by the iterator.
        for (i=prevBP-1; i>bp; i--) {
            if (t->expectedBreaks->elementAti(i) != 0) {
                errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
                    i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
            }
        }

        if (t->expectedBreaks->elementAti(bp) == 0) {
            errln("Reverse Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
                bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
        } else {
            // -1 marks a break whose expected rule status is 0.
            int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
            if (expectedTagVal == -1) {
                expectedTagVal = 0;
            }
            int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
            if (rs != expectedTagVal) {
                errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
                      " Actual, Expected status = %4d, %4d",
                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp), rs, expectedTagVal);
            }
        }

        prevBP = bp;
    }

    // Expected breaks before the lowest break actually returned. This is
    // still part of the reverse pass, so it is reported as such.
    for (i=prevBP-1; i>=0; i--) {
        if (t->expectedBreaks->elementAti(i) != 0) {
            errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
                i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
        }
    }
}
void RBBITest::TestExtended() {
UErrorCode status = U_ZERO_ERROR;
Locale locale = Locale::getDefault();
UnicodeString rules;
TestParams tp;
tp.bi = NULL;
tp.expectedBreaks = new UVector32(status);
tp.srcLine = new UVector32(status);
tp.srcCol = new UVector32(status);
const char *testDataDirectory = IntlTest::getSourceTestData(status);
char testFileName[1000];
if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
errln("Can't open test data. Path too long.");
return;
}
strcpy(testFileName, testDataDirectory);
strcat(testFileName, "rbbitst.txt");
int len;
UChar *testFile = ReadAndConvertFile(testFileName, len, status);
if (U_FAILURE(status)) {
return;
}
UnicodeString testString(FALSE, testFile, len);
enum EParseState{
PARSE_COMMENT,
PARSE_TAG,
PARSE_DATA,
PARSE_NUM
}
parseState = PARSE_TAG;
EParseState savedState = PARSE_TAG;
static const UChar CH_LF = 0x0a;
static const UChar CH_CR = 0x0d;
static const UChar CH_HASH = 0x23;
static const UChar CH_LT = 0x3c;
static const UChar CH_GT = 0x3e;
static const UChar CH_BACKSLASH = 0x5c;
static const UChar CH_BULLET = 0x2022;
int32_t lineNum = 1;
int32_t colStart = 0;
int32_t column = 0;
int32_t charIdx = 0;
int32_t tagValue = 0;
for (charIdx = 0; charIdx < len; ) {
UChar c = testString.charAt(charIdx);
charIdx++;
if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
c = CH_LF;
charIdx++;
}
if (c == CH_LF || c == CH_CR) {
lineNum++;
colStart = charIdx;
}
column = charIdx - colStart + 1;
switch (parseState) {
case PARSE_COMMENT:
if (c == 0x0a || c == 0x0d) {
parseState = savedState;
}
break;
case PARSE_TAG:
{
if (c == CH_HASH) {
parseState = PARSE_COMMENT;
savedState = PARSE_TAG;
break;
}
if (u_isUWhiteSpace(c)) {
break;
}
if (testString.compare(charIdx-1, 6, "<word>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createWordInstance(locale, status);
charIdx += 5;
break;
}
if (testString.compare(charIdx-1, 6, "<char>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createCharacterInstance(locale, status);
charIdx += 5;
break;
}
if (testString.compare(charIdx-1, 6, "<line>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createLineInstance(locale, status);
charIdx += 5;
break;
}
if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createSentenceInstance(locale, status);
charIdx += 5;
break;
}
if (testString.compare(charIdx-1, 7, "<title>") == 0) {
delete tp.bi;
tp.bi = BreakIterator::createTitleInstance(locale, status);
charIdx += 6;
break;
}
if (testString.compare(charIdx-1, 6, "<data>") == 0) {
parseState = PARSE_DATA;
charIdx += 5;
tp.dataToBreak = "";
tp.expectedBreaks->removeAllElements();
tp.srcCol ->removeAllElements();
tp.srcLine->removeAllElements();
break;
}
errln("line %d: Tag expected in test file.", lineNum);
goto end_test;
parseState = PARSE_COMMENT;
savedState = PARSE_DATA;
}
break;
case PARSE_DATA:
if (c == CH_BULLET) {
int32_t breakIdx = tp.dataToBreak.length();
tp.expectedBreaks->setSize(breakIdx+1);
tp.expectedBreaks->setElementAt(-1, breakIdx);
tp.srcLine->setSize(breakIdx+1);
tp.srcLine->setElementAt(lineNum, breakIdx);
tp.srcCol ->setSize(breakIdx+1);
tp.srcCol ->setElementAt(column, breakIdx);
break;
}
if (testString.compare(charIdx-1, 7, "</data>") == 0) {
tp.srcLine->addElement(lineNum, status);
tp.srcCol ->addElement(column, status);
parseState = PARSE_TAG;
charIdx += 7;
executeTest(&tp);
break;
}
if (testString.compare(charIdx-1, 3, "\\N{") == 0) {
int32_t nameEndIdx = testString.indexOf((UChar)0x7d, charIdx);
int32_t nameLength = nameEndIdx - (charIdx+2);
char charNameBuf[200];
UChar32 theChar = -1;
if (nameEndIdx != -1) {
UErrorCode status = U_ZERO_ERROR;
testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
charNameBuf[sizeof(charNameBuf)-1] = 0;
theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
if (U_FAILURE(status)) {
theChar = -1;
}
}
if (theChar == -1) {
errln("Error in named character in test file at line %d, col %d",
lineNum, column);
} else {
tp.dataToBreak.append(theChar);
while (tp.dataToBreak.length() > tp.srcLine->size()) {
tp.srcLine->addElement(lineNum, status);
tp.srcCol ->addElement(column, status);
}
}
if (nameEndIdx > charIdx) {
charIdx = nameEndIdx+1;
}
break;
}
if (testString.compare(charIdx-1, 2, "<>") == 0) {
charIdx++;
int32_t breakIdx = tp.dataToBreak.length();
tp.expectedBreaks->setSize(breakIdx+1);
tp.expectedBreaks->setElementAt(-1, breakIdx);
tp.srcLine->setSize(breakIdx+1);
tp.srcLine->setElementAt(lineNum, breakIdx);
tp.srcCol ->setSize(breakIdx+1);
tp.srcCol ->setElementAt(column, breakIdx);
break;
}
if (c == CH_LT) {
tagValue = 0;
parseState = PARSE_NUM;
break;
}
if (c == CH_HASH && column==3) { parseState = PARSE_COMMENT;
savedState = PARSE_DATA;
break;
}
if (c == CH_BACKSLASH) {
UChar32 cp = testString.char32At(charIdx);
if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
charIdx++;
}
if (cp == CH_LF || cp == CH_CR) {
lineNum++;
colStart = charIdx;
charIdx++;
break;
}
cp = testString.unescapeAt(charIdx);
if (cp != -1) {
tp.dataToBreak.append(cp);
while (tp.dataToBreak.length() > tp.srcLine->size()) {
tp.srcLine->addElement(lineNum, status);
tp.srcCol ->addElement(column, status);
}
break;
}
c = testString.charAt(charIdx);
charIdx = testString.moveIndex32(charIdx, 1);
}
tp.dataToBreak.append(c);
if (tp.dataToBreak.length() > tp.srcLine->size()) {
tp.srcLine->addElement(lineNum, status);
tp.srcCol ->addElement(column, status);
}
break;
case PARSE_NUM:
if (u_isUWhiteSpace(c)) {
break;
}
if (c == CH_GT) {
parseState = PARSE_DATA;
if (tagValue == 0) {
tagValue = -1;
}
int32_t breakIdx = tp.dataToBreak.length();
tp.expectedBreaks->setSize(breakIdx+1);
tp.expectedBreaks->setElementAt(tagValue, breakIdx);
tp.srcLine->setSize(breakIdx+1);
tp.srcLine->setElementAt(lineNum, breakIdx);
tp.srcCol ->setSize(breakIdx+1);
tp.srcCol ->setElementAt(column, breakIdx);
break;
}
if (u_isdigit(c)) {
tagValue = tagValue*10 + u_charDigitValue(c);
break;
}
errln("Syntax Error in test file at line %d, col %d",
lineNum, column);
goto end_test;
parseState = PARSE_COMMENT;
break;
}
if (U_FAILURE(status)) {
errln("ICU Error %s while parsing test file at line %d.",
u_errorName(status), lineNum);
goto end_test;
status = U_ZERO_ERROR;
}
}
end_test:
delete tp.bi;
delete tp.expectedBreaks;
delete tp.srcLine;
delete tp.srcCol;
delete [] testFile;
}
//
//  ReadAndConvertFile   Read a whole file and convert it to UChars.
//
//  The encoding is taken from the file's Unicode signature (BOM) if
//  ucnv_detectUnicodeSignature() finds one; otherwise a NULL converter name
//  is passed to ucnv_open().
//
//  @param fileName  Path of the file to read.
//  @param ulen      Receives the number of UChars in the result; 0 on failure.
//  @param status    ICU error code.  Nothing is done if it already indicates
//                   a failure.  Set to U_FILE_ACCESS_ERROR if the file cannot
//                   be opened or read.
//  @return          Buffer allocated with new[]; the caller releases it with
//                   delete[].  NULL on failure.
//
UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, UErrorCode &status) {
    // Everything is declared up front so that the gotos below do not jump
    // across an initialization.
    UChar       *retPtr          = NULL;
    char        *fileBuf         = NULL;
    UConverter  *conv            = NULL;
    FILE        *f               = NULL;
    const char  *fileBufC        = NULL;
    const char  *encoding        = NULL;
    int32_t      signatureLength = 0;
    int32_t      fileSize        = 0;
    long         rawSize         = -1;
    size_t       amtRead         = 0;
    UBool        ioFailure       = FALSE;

    ulen = 0;
    if (U_FAILURE(status)) {
        return retPtr;
    }

    //
    //  Open the file.
    //
    f = fopen(fileName, "rb");
    if (f == NULL) {
        errln("Error opening test data file %s\n", fileName);
        status = U_FILE_ACCESS_ERROR;
        return NULL;
    }

    //
    //  Read it.  The size is validated before it is used for an allocation;
    //  ftell() returns -1 on error.
    //
    if (fseek(f, 0, SEEK_END) == 0) {
        rawSize = ftell(f);
    }
    if (rawSize > 0 && fseek(f, 0, SEEK_SET) == 0) {
        fileSize = (int32_t)rawSize;
        fileBuf  = new char[fileSize];
        amtRead  = fread(fileBuf, 1, fileSize, f);
    }
    if (fileSize <= 0 || amtRead != (size_t)fileSize) {
        errln("Error reading test data file.");
        // Report the failure through status as well, the same way the
        // fopen() failure above does, so callers checking status see it.
        status    = U_FILE_ACCESS_ERROR;
        ioFailure = TRUE;
        goto cleanUpAndReturn;
    }

    //
    //  Look for a Unicode signature (BOM) and skip over it.
    //
    fileBufC = fileBuf;
    encoding = ucnv_detectUnicodeSignature(
        fileBuf, fileSize, &signatureLength, &status);
    if (encoding != NULL) {
        fileBufC += signatureLength;
        fileSize -= signatureLength;
    }

    //
    //  Open a converter for the detected encoding.
    //
    conv = ucnv_open(encoding, &status);
    if (U_FAILURE(status)) {
        goto cleanUpAndReturn;
    }

    //
    //  Convert.  The first call only computes the required length.
    //
    ulen = ucnv_toUChars(conv,
        NULL,           //  dest,
        0,              //  destCapacity,
        fileBufC,
        fileSize,
        &status);
    if (status == U_BUFFER_OVERFLOW_ERROR) {
        // Expected result of the length-only call above.
        status = U_ZERO_ERROR;
        retPtr = new UChar[ulen+1];
        ucnv_toUChars(conv,
            retPtr,       //  dest,
            ulen+1,
            fileBufC,
            fileSize,
            &status);
    }

cleanUpAndReturn:
    fclose(f);
    delete [] fileBuf;      // allocated with new[]
    ucnv_close(conv);
    if (U_FAILURE(status)) {
        if (!ioFailure) {
            // Read failures were already reported above.
            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
        }
        delete [] retPtr;   // allocated with new[]
        retPtr = 0;
        ulen   = 0;
    }
    return retPtr;
}
//
//  ScanState   Reader state for nextLBDToken().
//
struct ScanState {
    int32_t   fPeekChar;    // One character of look-ahead; valid only while fPeeked is TRUE.
    UBool     fPeeked;      // TRUE if fPeekChar holds a character not yet consumed.
    int32_t   fLineNum;     // Count of new lines seen so far; used in error messages.
    FILE     *fFile;        // The input file; opened and closed by the caller.

    // fPeekChar is given a defined value too, so that copying or inspecting
    // a fresh ScanState never reads an indeterminate member.
    ScanState() : fPeekChar(0), fPeeked(FALSE), fLineNum(0), fFile(NULL) {}
};
// Character values that nextLBDToken() treats specially while scanning the
// line break test data file.
static const int32_t chSpace = 0x20;
static const int32_t chTab = 0x09;
static const int32_t chCR = 0x0D;
static const int32_t chLF = 0x0A;
// '#': everything up to the next new line is skipped.
static const int32_t chHash = 0x23;
// 0xD7 (multiplication sign in Latin-1): skipped like white space.
static const int32_t chMult = 0xD7;
// 0xF7 (division sign in Latin-1): reported as token -1.
static const int32_t chDivide = 0xF7;
//
//  nextLBDToken   Return the next token from the line break test data file.
//
//  Return values:
//      >= 0   a character value, parsed from a run of hex digits
//      -1     a division sign (chDivide)
//      -2     a new line (CR, LF, or CR LF)
//      -3     end of file
//      -4     an unrecognized character
//
//  Spaces, tabs, multiplication signs and '#' comments are skipped.
//
static int32_t nextLBDToken(ScanState *s) {
    for (;;) {
        // Take the pending look-ahead character if there is one,
        // otherwise read from the file.
        int32_t ch = s->fPeeked ? s->fPeekChar : getc(s->fFile);
        s->fPeeked = FALSE;

        if (ch == EOF) {
            return -3;
        }
        if (ch == chDivide) {
            return -1;
        }
        if (ch == chSpace || ch == chTab || ch == chMult) {
            continue;
        }

        if (ch == chCR || ch == chLF) {
            s->fLineNum++;
            if (ch == chCR) {
                // Swallow the LF of a CR LF pair; anything else is kept
                // as look-ahead for the next call.
                s->fPeekChar = getc(s->fFile);
                if (s->fPeekChar != chLF) {
                    s->fPeeked = TRUE;
                }
            }
            return -2;
        }

        if (ch == chHash) {
            // Comment: consume up to, but not including, the line end,
            // then go around again to tokenize whatever follows.
            do {
                ch = getc(s->fFile);
            } while (ch != EOF && ch != chCR && ch != chLF);
            s->fPeekChar = ch;
            s->fPeeked   = TRUE;
            continue;
        }

        int32_t digit = u_digit(ch, 16);
        if (digit < 0) {
            // Nothing we know how to handle.
            return -4;
        }

        // Hex number: accumulate digits; the terminating character
        // becomes the look-ahead.
        int32_t value = digit;
        for (;;) {
            ch    = getc(s->fFile);
            digit = u_digit(ch, 16);
            if (digit < 0) {
                break;
            }
            value <<= 4;
            value += digit;
        }
        s->fPeekChar = ch;
        s->fPeeked   = TRUE;
        return value;
    }
}
void RBBITest::TestLineBreakData() {
UErrorCode status = U_ZERO_ERROR;
UnicodeString testString;
UVector expectedBreaks(status);
ScanState ss;
int32_t tok;
BreakIterator *bi = BreakIterator::createLineInstance(Locale::getDefault(), status);
if (U_FAILURE(status)) {
errln("Failure creating break iterator");
return;
}
const char * lbdfName = "LBTest.txt";
ss.fFile = fopen(lbdfName, "rb");
if (ss.fFile == NULL) {
logln("Unable to open Line Break Test Data file. Skipping test.");
delete bi;
return;
}
for (;;) {
testString.truncate(0);
expectedBreaks.removeAllElements();
for(;;) {
tok = nextLBDToken(&ss);
if (tok >= 0) {
testString.append((UChar32)tok);
continue;
}
if (tok == -1) {
expectedBreaks.addElement(testString.length(), status);
continue;
}
if (tok == -2 || tok == -3) {break;};
errln("Failure: Unrecognized data format, test file line %d", ss.fLineNum);
break;
}
if (testString.length() > 0) {
int32_t pos; int32_t expectedI = 0; int32_t expectedPos;
bi->setText(testString);
pos = bi->first(); pos = bi->next();
for (; pos != BreakIterator::DONE; ) {
expectedPos = expectedBreaks.elementAti(expectedI);
if (pos < expectedPos) {
errln("Failure: Test file line %d, unexpected break found at position %d",
ss.fLineNum, pos);
break;
}
if (pos > expectedPos) {
errln("Failure: Test file line %d, failed to find break at position %d",
ss.fLineNum, expectedPos);
break;
}
pos = bi->next();
expectedI++;
}
}
if (tok == -3) {
break;
}
}
fclose(ss.fFile);
delete bi;
}
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
// Abstract base class for the reference break implementations
// (RBBICharMonkey, RBBIWordMonkey, RBBILineMonkey) defined below.
class RBBIMonkeyKind {
public:
// Return the vector of character classes (UnicodeSet pointers in the
// subclasses below) for this kind of break.
virtual UVector *charClasses() = 0;
// Set the text to be examined by subsequent calls to next().
virtual void setText(const UnicodeString &s) = 0;
// Return the position of the first boundary following position i,
// or -1 if there is none.
virtual int32_t next(int32_t i) = 0;
virtual ~RBBIMonkeyKind();
// Set by the subclass constructors below when setting up their sets or
// matchers failed; U_ZERO_ERROR otherwise.
UErrorCode deferredStatus;
protected:
RBBIMonkeyKind();
private:
};
// Start out with no deferred error recorded.
RBBIMonkeyKind::RBBIMonkeyKind()
    : deferredStatus(U_ZERO_ERROR)
{
}
// Nothing is owned by the base class; the destructor exists so that
// subclasses can be destroyed through a base class pointer.
RBBIMonkeyKind::~RBBIMonkeyKind()
{
}
// State of the pseudo random number generator below.
static uint32_t m_seed = 1;

// Simple linear congruential random number generator.  Advances m_seed and
// returns 15 bits taken from the upper half of the new state, so results
// are in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t nextState = m_seed * 1103515245u + 12345u;
    m_seed = nextState;
    return (nextState >> 16) & 0x7fffu;
}
// Reference implementation of character (grapheme cluster) boundaries.
// Boundaries are found with the regular expression \X.
class RBBICharMonkey: public RBBIMonkeyKind {
public:
RBBICharMonkey();
virtual ~RBBICharMonkey();
virtual UVector *charClasses();
virtual void setText(const UnicodeString &s);
virtual int32_t next(int32_t i);
private:
// Vector of the sets below.  Created without a deleter argument; the
// destructor deletes each set individually.
UVector *fSets;
UnicodeSet *fCRLFSet;
UnicodeSet *fControlSet;
UnicodeSet *fExtendSet;
UnicodeSet *fHangulSet;
UnicodeSet *fAnySet;
// Matcher for \X, reset onto the text by setText().
RegexMatcher *fMatcher;
// The text being examined; not owned.
const UnicodeString *fText;
};
// Build the \X matcher and the character class sets.  Any failure is
// recorded in deferredStatus.
RBBICharMonkey::RBBICharMonkey() {
    UErrorCode  status = U_ZERO_ERROR;

    fText = NULL;
    fMatcher = new RegexMatcher("\\X", 0, status);    // Pattern to match a grapheme cluster

    fCRLFSet    = new UnicodeSet("[\\r\\n]", status);
    fControlSet = new UnicodeSet("[[\\p{Zl}\\p{Zp}\\p{Cc}\\p{Cf}]-[\\n]-[\\r]-\\p{Grapheme_Extend}]", status);
    fExtendSet  = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
    // The Hangul syllable types are L, V, T, LV and LVT.  The pattern used to
    // name L twice and never V, leaving the vowel jamo out of this set.
    fHangulSet  = new UnicodeSet(
        "[\\p{Hangul_Syllable_Type=L}\\p{Hangul_Syllable_Type=V}\\p{Hangul_Syllable_Type=T}"
        "\\p{Hangul_Syllable_Type=LV}\\p{Hangul_Syllable_Type=LVT}]", status);
    fAnySet     = new UnicodeSet("[\\u0000-\\U0010ffff]", status);

    // The vector does not own the sets; the destructor deletes them.
    fSets       = new UVector(status);
    fSets->addElement(fCRLFSet,    status);
    fSets->addElement(fControlSet, status);
    fSets->addElement(fExtendSet,  status);
    fSets->addElement(fHangulSet,  status);
    fSets->addElement(fAnySet,     status);
    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}
// Point the \X matcher at the new text and remember the text.
// The string is not copied.
void RBBICharMonkey::setText(const UnicodeString &s) {
    fMatcher->reset(s);
    fText = &s;
}
// Return the end of the first \X match found at or after position i,
// or -1 if nothing matches or the matcher reports an error.
int32_t RBBICharMonkey::next(int32_t i) {
    UErrorCode status = U_ZERO_ERROR;
    if (!fMatcher->find(i, status)) {
        return -1;
    }
    const int32_t matchEnd = fMatcher->end(status);
    return U_FAILURE(status) ? -1 : matchEnd;
}
// Accessor for the vector of character class sets built by the constructor.
UVector *RBBICharMonkey::charClasses()
{
    return this->fSets;
}
// Release everything the constructor allocated.  fSets only refers to the
// sets, so each set is deleted here as well.
RBBICharMonkey::~RBBICharMonkey()
{
    delete fMatcher;

    delete fAnySet;
    delete fHangulSet;
    delete fExtendSet;
    delete fControlSet;
    delete fCRLFSet;

    delete fSets;
}
// Reference implementation of word boundaries, written as an explicit
// sequence of character class tests in next().
class RBBIWordMonkey: public RBBIMonkeyKind {
public:
RBBIWordMonkey();
virtual ~RBBIWordMonkey();
virtual UVector *charClasses();
virtual void setText(const UnicodeString &s);
virtual int32_t next(int32_t i);
private:
// Vector of character class sets; the sets themselves are deleted
// individually by the destructor.
UVector *fSets;
UnicodeSet *fKatakanaSet;
UnicodeSet *fALetterSet;
UnicodeSet *fMidLetterSet;
UnicodeSet *fMidNumSet;
UnicodeSet *fNumericSet;
UnicodeSet *fFormatSet;
UnicodeSet *fOtherSet;
UnicodeSet *fExtendSet;
UnicodeSet *fExtendNumLetSet;
// NOTE(review): fMatcher is not referenced by any RBBIWordMonkey member
// function visible in this file.
RegexMatcher *fMatcher;
// The text being examined; not owned.
const UnicodeString *fText;
// Matches a grapheme cluster plus any following format characters.
RegexMatcher *fGCFMatcher;
// Matches a single grapheme cluster.
RegexMatcher *fGCMatcher;
};
// Build the character class sets and the two grapheme cluster matchers.
// Any failure is recorded in deferredStatus.  If building the sets fails,
// the constructor returns early and the matchers stay NULL, so callers must
// check deferredStatus before calling setText() or next().
//
// fMatcher and fText are not assigned anywhere in the body, so they are
// given defined (NULL) values here rather than being left indeterminate.
RBBIWordMonkey::RBBIWordMonkey() : fMatcher(NULL),
                                   fText(NULL),
                                   fGCFMatcher(NULL),
                                   fGCMatcher(NULL)
{
    UErrorCode  status = U_ZERO_ERROR;

    fSets            = new UVector(status);

    fKatakanaSet     = new UnicodeSet("[\\p{script=KATAKANA}"
                                      "\\u3031-\\u3035\\u309b\\u309c\\u30a0"
                                      "\\u30fc\\uff70\\uff9e\\uff9f]", status);

    const UnicodeString ALetterStr( "[[\\p{Alphabetic}"
                                    "\\u00a0"
                                    "\\u05f3]"
                                    "-[\\p{Ideographic}]"
                                    "-[\\p{Script=Lao}]"
                                    "-[\\p{Script=Hiragana}]"
                                    "-[\\p{Grapheme_Extend}]]");
    fALetterSet      = new UnicodeSet(ALetterStr, status);
    fALetterSet->removeAll(*fKatakanaSet);

    fMidLetterSet    = new UnicodeSet("[\\u0027\\u00b7\\u05f4\\u2019\\u2027\\u003a]", status);
    fMidNumSet       = new UnicodeSet("[[\\p{Line_Break=Infix_Numeric}]-[\\u003a]]", status);
    fNumericSet      = new UnicodeSet("[\\p{Nd}\\u066b\\u066c]", status);
    fFormatSet       = new UnicodeSet("[\\p{Format}-[\\u200c\\u200d]]", status);
    fExtendSet       = new UnicodeSet("[\\p{Grapheme_Extend}]", status);
    fExtendNumLetSet = new UnicodeSet("[\\p{Pc}-[\\u30fb\\uff65]]", status);
    fOtherSet        = new UnicodeSet();
    if(U_FAILURE(status)) {
        deferredStatus = status;
        return;
    }

    // "Other" is whatever remains after removing the sets listed below
    // from the full code point range.
    fOtherSet->complement();
    fOtherSet->removeAll(*fKatakanaSet);
    fOtherSet->removeAll(*fALetterSet);
    fOtherSet->removeAll(*fMidLetterSet);
    fOtherSet->removeAll(*fMidNumSet);
    fOtherSet->removeAll(*fNumericSet);
    fOtherSet->removeAll(*fExtendNumLetSet);

    // The vector does not own the sets; the destructor deletes them.
    fSets->addElement(fALetterSet,      status);
    fSets->addElement(fKatakanaSet,     status);
    fSets->addElement(fMidLetterSet,    status);
    fSets->addElement(fMidNumSet,       status);
    fSets->addElement(fNumericSet,      status);
    fSets->addElement(fFormatSet,       status);
    fSets->addElement(fOtherSet,        status);
    fSets->addElement(fExtendNumLetSet, status);

    fGCFMatcher = new RegexMatcher("\\X(?:[\\p{Format}-\\p{Grapheme_Extend}])*", 0, status);
    fGCMatcher  = new RegexMatcher("\\X", 0, status);

    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}
// Remember the text (not copied) and reset both grapheme cluster matchers
// onto it.
void RBBIWordMonkey::setText(const UnicodeString &s) {
    fText = &s;
    fGCFMatcher->reset(s);
    fGCMatcher->reset(s);
}
// Return the position of the first word boundary after prevPos, or -1 if
// prevPos is at or beyond the end of the text.
//
// The loop keeps a sliding window of four positions p0..p3 with the
// characters c0..c3 found there.  The candidate boundary is at p2, that is,
// between c1 and c2.  A "continue" means "no boundary here, slide the
// window forward"; a "break" means a boundary was found.
int32_t RBBIWordMonkey::next(int32_t prevPos) {
UErrorCode status = U_ZERO_ERROR;
int p0, p1, p2, p3;
int breakPos = -1;
UChar32 c0, c1, c2, c3;
// Already at the end of the text: nothing more to find.
if (prevPos >= fText->length()) {
return -1;
}
p0 = p1 = p2 = p3 = prevPos;
c3 = fText->char32At(prevPos);
c0 = c1 = c2 = 0;
// Special case: a format character directly after the previous boundary.
// The boundary is immediately after that one code point.
if (fFormatSet->contains(c3)) {
breakPos = fText->moveIndex32(prevPos, 1);
return breakPos;
}
for (;;) {
// Slide the window forward by one position.
p0 = p1; c0 = c1;
p1 = p2; c1 = c2;
p2 = p3; c2 = c3;
status = U_ZERO_ERROR;
// Advance p3 over one grapheme cluster plus trailing format characters.
// If nothing matches, p3 is put at the end of the text.
if (fGCFMatcher->find(p3, status) == FALSE) {
p3 = fText->length();
c3 = 0;
} else {
p3 = fGCFMatcher->end(0, status);
U_ASSERT(U_SUCCESS(status));
c3 = fText->char32At(p3);
}
// Window not yet filled (first pass through the loop).
if (p1 == p2) {
continue;
}
// The end of the text is always a boundary.
if (p2 == fText->length()) {
break;
}
// ALetter x ALetter
if (fALetterSet->contains(c1) &&
fALetterSet->contains(c2)) {
continue;
}
// ALetter x MidLetter ALetter
if ( fALetterSet->contains(c1) &&
fMidLetterSet->contains(c2) &&
fALetterSet->contains(c3)) {
continue;
}
// ALetter MidLetter x ALetter
if (fALetterSet->contains(c0) &&
(fMidLetterSet->contains(c1) ) &&
fALetterSet->contains(c2)) {
continue;
}
// Numeric x Numeric
if (fNumericSet->contains(c1) &&
fNumericSet->contains(c2)) {
continue;
}
// ALetter x Numeric
if (fALetterSet->contains(c1) &&
fNumericSet->contains(c2)) {
continue;
}
// Numeric x ALetter
if (fNumericSet->contains(c1) &&
fALetterSet->contains(c2)) {
continue;
}
// Numeric MidNum x Numeric
if ( fNumericSet->contains(c0) &&
fMidNumSet->contains(c1) &&
fNumericSet->contains(c2)) {
continue;
}
// Numeric x MidNum Numeric
if (fNumericSet->contains(c1) &&
fMidNumSet->contains(c2) &&
fNumericSet->contains(c3)) {
continue;
}
// Katakana x Katakana
if (fKatakanaSet->contains(c1) &&
fKatakanaSet->contains(c2)) {
continue;
}
// (ALetter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
fExtendNumLetSet->contains(c2)) {
continue;
}
// ExtendNumLet x (ALetter | Numeric | Katakana)
if (fExtendNumLetSet->contains(c1) &&
(fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
fKatakanaSet->contains(c2))) {
continue;
}
// None of the "no boundary" cases applied: boundary found.
break;
}
// Fix-up: re-match a plain grapheme cluster starting at p1 and, if one is
// found, use its end as the boundary, so that the boundary comes before any
// trailing format characters that the p3 advance above stepped over.
breakPos = p2;
status = U_ZERO_ERROR;
if (fGCMatcher->find(p1, status)) {
breakPos = fGCMatcher->end(0, status);
U_ASSERT(U_SUCCESS(status));
}
return breakPos;
}
// Accessor for the vector of character class sets built by the constructor.
UVector *RBBIWordMonkey::charClasses()
{
    return this->fSets;
}
// Release everything the constructor allocated.  fSets is created without
// a deleter, so it does not own the sets; each set is deleted here exactly
// once.
RBBIWordMonkey::~RBBIWordMonkey() {
    delete fSets;
    delete fKatakanaSet;
    delete fALetterSet;
    delete fMidLetterSet;
    delete fMidNumSet;
    delete fNumericSet;
    delete fFormatSet;
    delete fExtendSet;
    delete fExtendNumLetSet;    // was missing: leaked on every destruction
    delete fOtherSet;

    delete fGCFMatcher;
    delete fGCMatcher;
}
// Reference implementation of line break boundaries, written as an explicit
// sequence of tests on Line_Break property classes in next().
class RBBILineMonkey: public RBBIMonkeyKind {
public:
RBBILineMonkey();
virtual ~RBBILineMonkey();
virtual UVector *charClasses();
virtual void setText(const UnicodeString &s);
virtual int32_t next(int32_t i);
// Helper for next(): extends a character position over a following Hangul
// syllable / combining mark sequence.  See the function definition.
virtual void rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
// Vector of character class sets; the sets themselves are deleted
// individually by the destructor.
UVector *fSets;
// One set per Line_Break property value; the member name is the
// property value's two-letter abbreviation.
UnicodeSet *fBK;
UnicodeSet *fCR;
UnicodeSet *fLF;
UnicodeSet *fCM;
UnicodeSet *fNL;
// NOTE(review): fSG is not created, used or deleted by the constructor,
// destructor or any member function visible in this file.
UnicodeSet *fSG;
UnicodeSet *fWJ;
UnicodeSet *fZW;
UnicodeSet *fGL;
UnicodeSet *fCB;
UnicodeSet *fSP;
UnicodeSet *fB2;
UnicodeSet *fBA;
UnicodeSet *fBB;
UnicodeSet *fHY;
UnicodeSet *fCL;
UnicodeSet *fEX;
UnicodeSet *fIN;
UnicodeSet *fNS;
UnicodeSet *fOP;
UnicodeSet *fQU;
UnicodeSet *fIS;
UnicodeSet *fNU;
UnicodeSet *fPO;
UnicodeSet *fPR;
UnicodeSet *fSY;
UnicodeSet *fAI;
// The constructor also adds the contents of fXX, fAI and fSA to fAL.
UnicodeSet *fAL;
UnicodeSet *fID;
UnicodeSet *fSA;
UnicodeSet *fXX;
// Character (grapheme cluster) break iterator over the same text.
BreakIterator *fCharBI;
// The text being examined; not owned.
const UnicodeString *fText;
// NOTE(review): fOrigPositions is not referenced by any member function
// visible in this file.
int32_t *fOrigPositions;
// Matcher for a number sequence (prefix, digits, separators, suffix).
RegexMatcher *fNumberMatcher;
// Matcher for QU CM* SP* (OP) CM*.
RegexMatcher *fLB10Matcher;
// Matcher for CL CM* SP* (NS) CM*.
RegexMatcher *fLB11Matcher;
};
// Build one UnicodeSet per Line_Break class, the regular expression matchers
// used by next(), and the character break iterator.  Any failure is recorded
// in deferredStatus.
//
// fSG, fText and fOrigPositions are not assigned anywhere in the body, so
// they are given defined (NULL) values here rather than being left
// indeterminate.
RBBILineMonkey::RBBILineMonkey()
    : fSG(NULL),
      fText(NULL),
      fOrigPositions(NULL)
{
    UErrorCode  status = U_ZERO_ERROR;

    fSets  = new UVector(status);

    fBK    = new UnicodeSet("[\\p{Line_Break=BK}]", status);
    fCR    = new UnicodeSet("[\\p{Line_break=CR}]", status);
    fLF    = new UnicodeSet("[\\p{Line_break=LF}]", status);
    fCM    = new UnicodeSet("[\\p{Line_break=CM}]", status);
    fNL    = new UnicodeSet("[\\p{Line_break=NL}]", status);
    fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]", status);
    fZW    = new UnicodeSet("[\\p{Line_break=ZW}]", status);
    fGL    = new UnicodeSet("[\\p{Line_break=GL}]", status);
    fCB    = new UnicodeSet("[\\p{Line_break=CB}]", status);
    fSP    = new UnicodeSet("[\\p{Line_break=SP}]", status);
    fB2    = new UnicodeSet("[\\p{Line_break=B2}]", status);
    fBA    = new UnicodeSet("[\\p{Line_break=BA}]", status);
    fBB    = new UnicodeSet("[\\p{Line_break=BB}]", status);
    fHY    = new UnicodeSet("[\\p{Line_break=HY}]", status);
    fCL    = new UnicodeSet("[\\p{Line_break=CL}]", status);
    fEX    = new UnicodeSet("[\\p{Line_break=EX}]", status);
    fIN    = new UnicodeSet("[\\p{Line_break=IN}]", status);
    fNS    = new UnicodeSet("[\\p{Line_break=NS}]", status);
    fOP    = new UnicodeSet("[\\p{Line_break=OP}]", status);
    fQU    = new UnicodeSet("[\\p{Line_break=QU}]", status);
    fIS    = new UnicodeSet("[\\p{Line_break=IS}]", status);
    fNU    = new UnicodeSet("[\\p{Line_break=NU}]", status);
    fPO    = new UnicodeSet("[\\p{Line_break=PO}]", status);
    fPR    = new UnicodeSet("[\\p{Line_break=PR}]", status);
    fSY    = new UnicodeSet("[\\p{Line_break=SY}]", status);
    fAI    = new UnicodeSet("[\\p{Line_break=AI}]", status);
    fAL    = new UnicodeSet("[\\p{Line_break=AL}]", status);
    fID    = new UnicodeSet("[\\p{Line_break=ID}]", status);
    fSA    = new UnicodeSet("[\\p{Line_break=SA}]", status);
    fXX    = new UnicodeSet("[\\p{Line_break=XX}]", status);

    // Characters of class XX, AI and SA are handled as AL by next().
    fAL->addAll(*fXX);
    fAL->addAll(*fAI);
    fAL->addAll(*fSA);

    // The vector does not own the sets; the destructor deletes them.
    fSets->addElement(fBK, status);
    fSets->addElement(fCR, status);
    fSets->addElement(fLF, status);
    fSets->addElement(fCM, status);
    fSets->addElement(fNL, status);
    fSets->addElement(fWJ, status);
    fSets->addElement(fZW, status);
    fSets->addElement(fGL, status);
    fSets->addElement(fCB, status);
    fSets->addElement(fSP, status);
    fSets->addElement(fB2, status);
    fSets->addElement(fBA, status);
    fSets->addElement(fBB, status);
    fSets->addElement(fHY, status);
    fSets->addElement(fCL, status);
    fSets->addElement(fEX, status);
    fSets->addElement(fIN, status);
    fSets->addElement(fNS, status);
    fSets->addElement(fOP, status);
    fSets->addElement(fQU, status);
    fSets->addElement(fIS, status);
    fSets->addElement(fNU, status);
    fSets->addElement(fPO, status);
    fSets->addElement(fPR, status);
    fSets->addElement(fSY, status);
    fSets->addElement(fAI, status);
    fSets->addElement(fAL, status);
    fSets->addElement(fID, status);
    // NOTE(review): fWJ is added a second time here; kept as found.
    fSets->addElement(fWJ, status);
    fSets->addElement(fSA, status);

    // A number: optional prefix, optional opening punctuation or hyphen,
    // digits with embedded separators, optional closing and postfix.
    // Each element may be followed by combining marks.
    fNumberMatcher = new RegexMatcher(
        "(\\p{Line_Break=PR}\\p{Line_Break=CM}*)?"
        "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
        "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
        "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
        "(\\p{Line_Break=CL}\\p{Line_Break=CM}*)?"
        "(\\p{Line_Break=PO}\\p{Line_Break=CM}*)?",
        0, status);

    // QU CM* SP* (OP) CM*
    fLB10Matcher = new RegexMatcher(
        "\\p{Line_Break=QU}\\p{Line_Break=CM}*"
        "\\p{Line_Break=SP}*"
        "(\\p{Line_Break=OP})\\p{Line_Break=CM}*",
        0, status);

    // CL CM* SP* (NS) CM*
    fLB11Matcher = new RegexMatcher(
        "\\p{Line_Break=CL}\\p{Line_Break=CM}*"
        "\\p{Line_Break=SP}*"
        "(\\p{Line_Break=NS})\\p{Line_Break=CM}*",
        0, status);

    fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);

    if (U_FAILURE(status)) {
        deferredStatus = status;
    }
}
// Remember the text (not copied) and attach the character break iterator
// and the number matcher to it.
void RBBILineMonkey::setText(const UnicodeString &s) {
    fCharBI->setText(s);
    fNumberMatcher->reset(s);
    fText = &s;
}
// Adjust the position following a character so that a Hangul syllable or a
// run of combining marks is treated as part of that character.
//
//   pos       position of the character being examined; -1 means "no
//             character", in which case nothing is changed.
//   posChar   in/out: the character at pos.  May be replaced by a stand-in
//             (see below).
//   nextPos   in/out: position of the following character; may be advanced.
//   nextChar  out: the character at the updated *nextPos.
void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
if (pos == -1) {
return;
}
int32_t nPos = *nextPos;
// Hangul: advance to the end of the grapheme cluster, then walk back one
// code point at a time until a character of class ID is found, and stop
// just after it.
// NOTE(review): the walk back has no explicit lower bound; it relies on an
// ID class character being found.
int32_t hangultype = u_getIntPropertyValue(*posChar, UCHAR_HANGUL_SYLLABLE_TYPE);
if (hangultype != U_HST_NOT_APPLICABLE) {
nPos = fCharBI->following(pos); for (;;) {
nPos = fText->moveIndex32(nPos, -1);
UChar32 possiblyExtendChar = fText->char32At(nPos);
if (fID->contains(possiblyExtendChar)) {
nPos = fText->moveIndex32(nPos, +1);
break;
}
}
}
// Unless posChar is a hard line break or ZW, advance over any following
// combining marks (class CM).
if (!(fBK->contains(*posChar) || fZW->contains(*posChar) || *posChar==0x0a
|| *posChar==0x0d || *posChar==0x85)) {
for (;;) {
*nextChar = fText->char32At(nPos);
if (!fCM->contains(*nextChar)) {
break;
}
nPos = fText->moveIndex32(nPos, 1);
}
}
// A space that absorbed something is replaced by U+4E00, which stands in
// for a character of class ID.
if (nPos != *nextPos && fSP->contains(*posChar)) {
*posChar = 0x4e00; }
// A combining mark left standing on its own is replaced by 'A' (0x41),
// which stands in for a character of class AL.
if (fCM->contains(*posChar)) {
*posChar = 0x41; }
// Hand the (possibly advanced) following position and its character back
// to the caller.
*nextPos = nPos;
*nextChar = fText->char32At(nPos);
}
// Return the position of the first line break opportunity after startPos,
// or -1 if startPos is at or beyond the end of the text.
//
// Each pass of the loop examines the position "pos", between prevChar (at
// prevPos) and thisChar (at pos).  "break" leaves the loop and reports pos
// as a break; "continue" means no break at pos, move on to nextPos.
// The order of the tests is significant.
int32_t RBBILineMonkey::next(int32_t startPos) {
UErrorCode status = U_ZERO_ERROR;
int32_t pos; UChar32 thisChar;
int32_t prevPos; UChar32 prevChar;
int32_t nextPos; int32_t nextCPPos; int32_t tPos; UChar32 c;
if (startPos >= fText->length()) {
return -1;
}
// Initial state: no previous or current character yet.  The first pass of
// the loop only loads the window (see the prevPos == -1 test below).
pos = prevPos = -1; thisChar = prevChar = 0;
nextPos = nextCPPos = startPos;
for (;;) {
// Advance the window by one position.
prevPos = pos;
prevChar = thisChar;
pos = nextPos;
thisChar = fText->char32At(pos);
nextCPPos = fText->moveIndex32(pos, 1);
nextPos = nextCPPos;
// The end of the text is always a break.
if (pos >= fText->length()) {
break;
}
// Always break after a BK class character.
if (fBK->contains(prevChar)) {
break;
}
// No break between CR and LF.
if (prevChar == 0x0d && thisChar == 0x0a) {
continue;
}
// Always break after CR, LF or NEL (0x85).
if (prevChar == 0x0d ||
prevChar == 0x0a ||
prevChar == 0x85) {
break;
}
// No break before a hard line break character.
if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
fBK->contains(thisChar)) {
continue;
}
// QU CM* SP* x OP: if the text starting at prevPos matches, move pos to the
// OP character (capture group 1) and resume after the whole match.
if (prevPos >= 0) {
UnicodeString subStr10(*fText, prevPos);
fLB10Matcher->reset(subStr10);
status = U_ZERO_ERROR;
if (fLB10Matcher->lookingAt(status)) { pos = prevPos + fLB10Matcher->start(1, status);
nextPos = prevPos + fLB10Matcher->end(0, status);
thisChar = fText->char32At(pos);
continue;
}
}
// CL CM* SP* x NS: handled the same way, with pos moved to the NS character.
if (prevPos >= 0) {
UnicodeString subStr11(*fText, prevPos);
fLB11Matcher->reset(subStr11);
status = U_ZERO_ERROR;
if (fLB11Matcher->lookingAt(status)) { pos = prevPos + fLB11Matcher->start(1, status);
nextPos = prevPos + fLB11Matcher->end(0, status);
thisChar = fText->char32At(pos);
continue;
}
}
// No break before a space.
if (fSP->contains(thisChar)) {
continue;
}
// No break before ZW.
if (fZW->contains(thisChar)) {
continue;
}
// Break after ZW.
if (fZW->contains(prevChar)) {
break;
}
// Let prevChar absorb a following Hangul syllable / combining sequence.
// This may advance pos and change prevChar and thisChar.
rule67Adjust(prevPos, &prevChar, &pos, &thisChar);
nextCPPos = fText->moveIndex32(pos, 1);
nextPos = nextCPPos;
c = fText->char32At(nextPos);
// The adjustment may have moved pos onto a space: no break before it.
if (fSP->contains(thisChar)) {
continue;
}
// Do the same for thisChar, which may advance nextPos.
rule67Adjust(pos, &thisChar, &nextPos, &c);
// First pass of the loop: there is no previous character to test against.
if (prevPos == -1) {
continue;
}
// The tests on thisChar from above are repeated, since pos and thisChar
// may have been changed by the adjustments.
if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || fBK->contains(thisChar)) {
continue;
}
if (fSP->contains(thisChar)) { continue;
}
if (fZW->contains(thisChar)) { continue;
}
// No break before CL, IS or SY unless preceded by NU (the NU cases are
// left to the number matcher further down); never break before EX.
if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
fEX->contains(thisChar) ||
!fNU->contains(prevChar) && fIS->contains(thisChar) ||
!fNU->contains(prevChar) && fSY->contains(thisChar)) {
continue;
}
// No break after OP CM* SP*.  Scan backwards from prevPos over spaces (only
// if prevChar really is a space, since the adjustments above may have
// replaced it) and then over combining marks, and test what is found.
tPos = prevPos;
if (fSP->contains(prevChar)) {
while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
tPos=fText->moveIndex32(tPos, -1);
}
}
while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
tPos=fText->moveIndex32(tPos, -1);
}
if (fOP->contains(fText->char32At(tPos))) {
continue;
}
// B2 x B2
if (fB2->contains(thisChar) && fB2->contains(prevChar)) {
continue;
}
// No break before or after GL.
if (fGL->contains(thisChar) || fGL->contains(prevChar)) {
continue;
}
// No break before or after WJ.
if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
continue;
}
// Break after spaces.
if (fSP->contains(prevChar)) {
break;
}
// No break before or after QU.
if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
continue;
}
// Break before and after CB.
if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
break;
}
// No break before BA, HY or NS; no break after BB.
if (fBA->contains(thisChar) ||
fHY->contains(thisChar) ||
fNS->contains(thisChar) ||
fBB->contains(prevChar) ) {
continue;
}
// (AL | ID | IN | NU) x IN
if (fAL->contains(prevChar) && fIN->contains(thisChar) ||
fID->contains(prevChar) && fIN->contains(thisChar) ||
fIN->contains(prevChar) && fIN->contains(thisChar) ||
fNU->contains(prevChar) && fIN->contains(thisChar) ) {
continue;
}
// ID x PO, CM x PO, AL x NU, NU x AL
if (fID->contains(prevChar) && fPO->contains(thisChar) ||
fCM->contains(prevChar) && fPO->contains(thisChar) ||
fAL->contains(prevChar) && fNU->contains(thisChar) ||
fNU->contains(prevChar) && fAL->contains(thisChar) ) {
continue;
}
// Numbers: if a number starts at prevPos and reaches past pos, there is no
// break here.  If it also reaches past nextPos, skip ahead so that the next
// pass examines the position after the last character of the number,
// stepping back over any combining marks at its end.
UnicodeString subStr18(*fText, prevPos);
fNumberMatcher->reset(subStr18);
if (fNumberMatcher->lookingAt(status)) {
int32_t numEndIdx = prevPos + fNumberMatcher->end(status); if (numEndIdx > pos) {
if (numEndIdx > nextPos) {
nextPos = numEndIdx;
pos = fCharBI->preceding(numEndIdx);
thisChar = fText->char32At(pos);
while (fCM->contains(thisChar)) {
pos = fCharBI->preceding(pos);
thisChar = fText->char32At(pos);
}
}
continue;
}
}
// PR x AL
if (fPR->contains(prevChar) && fAL->contains(thisChar)) {
continue;
}
// PR x ID
if (fPR->contains(prevChar) && fID->contains(thisChar)) {
continue;
}
// Break after HY and before BB.
if (fHY->contains(prevChar) || fBB->contains(thisChar)) {
break;
}
// AL x AL
if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
continue;
}
// IS x AL
if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
continue;
}
// Anything else: break.
break;
}
return pos;
}
// Accessor for the vector of character class sets built by the constructor.
UVector *RBBILineMonkey::charClasses()
{
    return this->fSets;
}
// Release everything the constructor allocated.  fSets only refers to the
// sets, so each set is deleted here as well.
RBBILineMonkey::~RBBILineMonkey()
{
    delete fSets;

    // Every set created by the constructor, in the order it created them.
    UnicodeSet *ownedSets[] = {
        fBK, fCR, fLF, fCM, fNL, fWJ, fZW, fGL, fCB, fSP,
        fB2, fBA, fBB, fHY, fCL, fEX, fIN, fNS, fOP, fQU,
        fIS, fNU, fPO, fPR, fSY, fAI, fAL, fID, fSA, fXX
    };
    const int32_t ownedSetCount = (int32_t)(sizeof(ownedSets) / sizeof(ownedSets[0]));
    for (int32_t setIdx = 0; setIdx < ownedSetCount; setIdx++) {
        delete ownedSets[setIdx];
    }

    delete fCharBI;
    delete fNumberMatcher;
    delete fLB10Matcher;
    delete fLB11Matcher;
}
static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
int32_t val = defaultVal;
name.append(" *= *(-?\\d+)");
UErrorCode status = U_ZERO_ERROR;
RegexMatcher m(name, params, 0, status);
if (m.find()) {
char valString[100];
int32_t paramLength = m.end(1, status) - m.start(1, status);
if (paramLength >= (int32_t)(sizeof(valString)-1)) {
paramLength = (int32_t)(sizeof(valString)-2);
}
params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
val = strtol(valString, NULL, 10);
m.reset();
params = m.replaceFirst("", status);
}
U_ASSERT(U_SUCCESS(status));
return val;
}
#endif
//
//  testBreakBoundPreceding   Check a break iterator against a list of
//                            expected boundaries, in several ways.
//
//  @param test           Test object used for reporting errors.
//  @param ustr           The text to break.
//  @param bi             The break iterator under test.
//  @param expected       The expected boundary positions, ascending.
//  @param expectedcount  Number of entries in expected.
//
//  Checks, in order: forward iteration with next(); isBoundary() at and
//  between the expected positions; reverse iteration with previous()
//  against what forward iteration returned; and preceding() from every
//  position after an expected boundary.
//
static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
                                    BreakIterator *bi,
                                    int expected[],
                                    int expectedcount)
{
    int count = 0;
    int i     = 0;
    int forward[50];
    const int forwardCapacity = (int)(sizeof(forward) / sizeof(forward[0]));

    bi->setText(ustr);

    // Forward iteration.  Remember every boundary for the reverse check.
    for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
        if (count >= forwardCapacity) {
            // More boundaries than forward[] can hold; stop rather than
            // write past the end of the array.
            printStringBreaks(ustr, expected, expectedcount);
            test->errln("break forward test failed: more than %d boundaries found",
                        forwardCapacity);
            return;
        }
        forward[count] = i;
        if (count < expectedcount && expected[count] != i) {
            test->errln("break forward test failed: expected %d but got %d",
                        expected[count], i);
            break;
        }
        count ++;
    }
    if (count != expectedcount) {
        printStringBreaks(ustr, expected, expectedcount);
        test->errln("break test failed: missed %d match",
                    expectedcount - count);
        return;
    }

    // isBoundary() must be true at each expected position and false at
    // every position in between.
    for (i = 1; i < expectedcount; i ++) {
        int j = expected[i - 1];
        if (!bi->isBoundary(j)) {
            printStringBreaks(ustr, expected, expectedcount);
            test->errln("Expected boundary at position %d", j);
            return;
        }
        for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
            if (bi->isBoundary(j)) {
                printStringBreaks(ustr, expected, expectedcount);
                test->errln("Not expecting boundary at position %d", j);
                return;
            }
        }
    }

    // Reverse iteration must return the forward boundaries in reverse order.
    for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
        count --;
        if (count < 0 || forward[count] != i) {
            // count < 0: reverse iteration returned more boundaries than
            // forward iteration did, so DONE was expected here.  Must not
            // index forward[] in that case.
            test->errln("happy break test reverse failed: expected %d but got %d",
                        count < 0 ? (int)BreakIterator::DONE : forward[count], i);
            break;
        }
    }
    if (count != 0) {
        printStringBreaks(ustr, expected, expectedcount);
        test->errln("happy break test failed: missed a match");
        return;
    }

    // preceding(j) must return expected[i] for every j in
    // (expected[i], expected[i+1]].
    for (i = 0; i < expectedcount - 1; i ++) {
        int j = expected[i] + 1;
        for (; j <= expected[i + 1]; j ++) {
            if (bi->preceding(j) != expected[i]) {
                printStringBreaks(ustr, expected, expectedcount);
                test->errln("Not expecting backwards boundary at position %d", j);
                return;
            }
        }
    }
}
void RBBITest::TestWordBreaks(void)
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
UChar str[300];
static const char *strlist[] =
{
"\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
"\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
"\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a",
"\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
"\\u90ca\\u3588\\u009c\\u0953\\u194b",
"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
"\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
"\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
"\\u2027\\U000e0067\\u0a47\\u00b7",
"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
"\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
"\\u0589\\U000e006e\\u0a42\\U000104a5",
"\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
"\\u0027\\u11af\\U000e0057\\u0602",
"\\U0001d7f2\\U000e007\\u0004\\u0589",
"\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
"\\U0001d7f2\\U000e007d\\u0004\\u0589",
"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
"\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
"\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
"\\u0233\\U000e0020\\u0a69\\u0d6a",
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
"\\u58f4\\U000e0049\\u20e7\\u2027",
"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
"\\ua183\\u102d\\u0bec\\u003a",
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
"\\u003a\\u0e57\\u0fad\\u002e",
"\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
"\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
"\\U000e005d\\u2044\\u0731\\u0650\\u0061",
"\\u003a\\u0664\\u00b7\\u1fba",
"\\u003b\\u0027\\u00b7\\u47a3",
"\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
"\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
"\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
};
int loop;
if (U_FAILURE(status)) {
errln("Creation of break iterator failed %s", u_errorName(status));
return;
}
for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
u_unescape(strlist[loop], str, 25);
UnicodeString ustr(str);
RBBIWordMonkey monkey;
int expected[50];
int expectedcount = 0;
monkey.setText(ustr);
int i;
for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
expected[expectedcount ++] = i;
}
testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
}
delete bi;
#endif
}
void RBBITest::TestWordBoundary(void)
{
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
UChar str[50];
static const char *strlist[] =
{
"\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
"\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
"\\u003b\\u024a\\u102e\\U000e0071\\u0600",
"\\u2027\\U000e0067\\u0a47\\u00b7",
"\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
"\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
"\\u0589\\U000e006e\\u0a42\\U000104a5",
"\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
"\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
"\\u0027\\u11af\\U000e0057\\u0602",
"\\U0001d7f2\\U000e007\\u0004\\u0589",
"\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
"\\U0001d7f2\\U000e007d\\u0004\\u0589",
"\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
"\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
"\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
"\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
"\\u0233\\U000e0020\\u0a69\\u0d6a",
"\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
"\\u58f4\\U000e0049\\u20e7\\u2027",
"\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
"\\ua183\\u102d\\u0bec\\u003a",
"\\u17e8\\u06e7\\u002e\\u096d\\u003b",
"\\u003a\\u0e57\\u0fad\\u002e",
"\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
"\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
"\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
"\\u003a\\u0664\\u00b7\\u1fba",
"\\u003b\\u0027\\u00b7\\u47a3",
};
int loop;
if (U_FAILURE(status)) {
errln("Creation of break iterator failed %s", u_errorName(status));
return;
}
for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
u_unescape(strlist[loop], str, 20);
UnicodeString ustr(str);
int forward[50];
int count = 0;
bi->setText(ustr);
int prev = 0;
int i;
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count ++] = i;
if (i > prev) {
int j;
for (j = prev + 1; j < i; j ++) {
if (bi->isBoundary(j)) {
printStringBreaks(ustr, forward, count);
errln("happy boundary test failed: expected %d not a boundary",
j);
return;
}
}
}
if (!bi->isBoundary(i)) {
printStringBreaks(ustr, forward, count);
errln("happy boundary test failed: expected %d a boundary",
i);
return;
}
prev = i;
}
}
delete bi;
}
void RBBITest::TestLineBreaks(void)
{
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
UChar str[50];
static const char *strlist[] =
{
"\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
"\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
"\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
"\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
"\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
"\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
"\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
"\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
"\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
"\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
"\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
"\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
"\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
"\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
"\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
"\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
"\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
"\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
"\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
"\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
"\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
"\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
"\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
"\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
"\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
"\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
"\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
"\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
"\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
"\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
"\\u2014\\u0020\\u000a\\u17c5\\u24fc",
"\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
"\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
"\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
"\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
"\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
"\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
};
int loop;
if (U_FAILURE(status)) {
errln("Creation of break iterator failed %s", u_errorName(status));
return;
}
for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
u_unescape(strlist[loop], str, 20);
UnicodeString ustr(str);
RBBILineMonkey monkey;
int expected[50];
int expectedcount = 0;
monkey.setText(ustr);
int i;
for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
expected[expectedcount ++] = i;
}
testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
}
delete bi;
#endif
}
void RBBITest::TestSentBreaks(void)
{
Locale locale("en");
UErrorCode status = U_ZERO_ERROR;
BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
UChar str[100];
static const char *strlist[] =
{
"Now\ris\nthe\r\ntime\n\rfor\r\r",
"This\n",
"Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
"\"Sentence ending with a quote.\" Bye.",
" (This is it). Testing the sentence iterator. \"This isn't it.\"",
"Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
"Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
"Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
"Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
"Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
};
int loop;
int forward[100];
if (U_FAILURE(status)) {
errln("Creation of break iterator failed %s", u_errorName(status));
return;
}
for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
u_unescape(strlist[loop], str, 100);
UnicodeString ustr(str);
int count = 0;
bi->setText(ustr);
int i;
for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
forward[count ++] = i;
}
testBreakBoundPreceding(this, ustr, bi, forward, count);
}
delete bi;
}
void RBBITest::TestMonkey(char *params) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
UErrorCode status = U_ZERO_ERROR;
int32_t loopCount = 500;
int32_t seed = 1;
UnicodeString breakType = "all";
Locale locale("en");
if (quick == FALSE) {
loopCount = 10000;
}
if (params) {
UnicodeString p(params);
loopCount = getIntParam("loop", p, loopCount);
seed = getIntParam("seed", p, seed);
RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
if (m.find()) {
breakType = m.group(1, status);
m.reset();
p = m.replaceFirst("", status);
}
m.reset(p);
if (RegexMatcher("\\S", p, 0, status).find()) {
char buf[100];
p.extract(buf, sizeof(buf), NULL, status);
buf[sizeof(buf)-1] = 0;
errln("Unrecognized or extra parameter: %s\n", buf);
return;
}
}
if (breakType == "char" || breakType == "all") {
RBBICharMonkey m;
BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "char", seed, loopCount);
}
else {
errln("Creation of character break iterator failed %s", u_errorName(status));
}
delete bi;
}
if (breakType == "word" || breakType == "all") {
logln("Word Break Monkey Test");
RBBIWordMonkey m;
BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "word", seed, loopCount);
}
else {
errln("Creation of word break iterator failed %s", u_errorName(status));
}
delete bi;
}
if (breakType == "line" || breakType == "all") {
logln("Line Break Monkey Test");
RBBILineMonkey m;
BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
if (params == NULL) {
loopCount = 50;
}
if (U_SUCCESS(status)) {
RunMonkey(bi, m, "line", seed, loopCount);
}
else {
errln("Creation of line break iterator failed %s", u_errorName(status));
}
delete bi;
}
#endif
}
//
//  RunMonkey
//
//  Randomized comparison of a BreakIterator against a reference implementation.
//
//  bi             the break iterator under test.
//  mk             the reference ("monkey") implementation; supplies the
//                 character classes used to build random text and the
//                 expected break positions via mk.next().
//  name           label used in error messages ("char", "word", "line", ...).
//  seed           initial value stored into m_seed before any text is built.
//  numIterations  number of random strings to test; -1 loops without end and
//                 prints a progress dot to stderr every tenth iteration.
//
//  Each iteration builds a string of TESTSTRINGLEN random code points, records
//  the expected breaks from mk, then records the breaks found with
//  next(), previous(), isBoundary(), following() and preceding() and compares
//  each set against the expected one.  The first mismatching index is reported
//  with a window of surrounding text, after which the next iteration starts.
//
//  Compiled to an empty function when regular expressions are configured out.
//
void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, int32_t numIterations) {
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
    const int32_t    TESTSTRINGLEN = 500;
    UnicodeString    testText;
    int32_t          numCharClasses;
    UVector         *chClasses;
    // One flag per UTF-16 index.  Every code point may need two units, hence
    // room for TESTSTRINGLEN*2 units plus the end-of-text position.
    char             expectedBreaks[TESTSTRINGLEN*2 + 1];
    char             forwardBreaks[TESTSTRINGLEN*2 + 1];
    char             reverseBreaks[TESTSTRINGLEN*2+1];
    char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
    char             followingBreaks[TESTSTRINGLEN*2+1];
    char             precedingBreaks[TESTSTRINGLEN*2+1];
    int              i;
    int              loopCount = 0;

    m_seed = seed;
    numCharClasses = mk.charClasses()->size();
    chClasses      = mk.charClasses();
    if (U_FAILURE(mk.deferredStatus)) {
        errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
        return;
    }

    // Every character class must be usable as a source of random characters.
    for (i=0; i<numCharClasses; i++) {
        UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
        if (s == NULL || s->size() == 0) {
            errln("Character Class #%d is null or of zero size.", i);
            return;
        }
    }

    while (loopCount < numIterations || numIterations == -1) {
        if (numIterations == -1 && loopCount % 10 == 0) {
            fprintf(stderr, ".");
        }
        // Capture m_seed as it stands at the start of this iteration; this is
        // the value printed in the error reports below.
        seed = m_seed;

        // Build the random test text: pick a class, then a character from it.
        testText.truncate(0);
        for (i=0; i<TESTSTRINGLEN; i++) {
            int32_t     aClassNum = m_rand() % numCharClasses;
            UnicodeSet *classSet  = (UnicodeSet *)chClasses->elementAt(aClassNum);
            int32_t     charIdx   = m_rand() % classSet->size();
            UChar32     c         = classSet->charAt(charIdx);
            if (c < 0) {
                errln("c < 0");
            }
            testText.append(c);
        }

        // Expected breaks, from the reference implementation.
        mk.setText(testText);
        memset(expectedBreaks, 0, sizeof(expectedBreaks));
        expectedBreaks[0] = 1;
        int32_t breakPos = 0;
        for (;;) {
            breakPos = mk.next(breakPos);
            if (breakPos == -1) {
                break;
            }
            if (breakPos > testText.length()) {
                errln("breakPos > testText.length()");
                // Never use an out-of-range position as an array index.
                break;
            }
            expectedBreaks[breakPos] = 1;
        }

        // Breaks found by iterating forward with next().
        memset(forwardBreaks, 0, sizeof(forwardBreaks));
        bi->setText(testText);
        for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
            if (i < 0 || i > testText.length()) {
                errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
                break;
            }
            forwardBreaks[i] = 1;
        }

        // Breaks found by iterating backward with previous().
        memset(reverseBreaks, 0, sizeof(reverseBreaks));
        for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
            if (i < 0 || i > testText.length()) {
                errln("%s break monkey test: Out of range value returned by breakIterator::previous()", name);
                break;
            }
            reverseBreaks[i] = 1;
        }

        // Breaks as reported by isBoundary() for every index.
        memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
        U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
        for (i=0; i<=testText.length(); i++) {
            isBoundaryBreaks[i] = bi->isBoundary(i);
        }

        // Breaks found by calling following() from every index.  The result
        // must lie after i, within the text, must never move backwards, and
        // may only move past the previous result once i has reached it.
        memset(followingBreaks, 0, sizeof(followingBreaks));
        int32_t lastBreakPos = 0;
        followingBreaks[0] = 1;
        for (i=0; i<testText.length(); i++) {
            breakPos = bi->following(i);
            if (breakPos <= i ||
                breakPos < lastBreakPos ||
                breakPos > testText.length() ||
                (breakPos > lastBreakPos && lastBreakPos > i)) {
                errln("%s break monkey test: "
                    "Out of range value returned by BreakIterator::following().\n"
                    "Random seed=%d", name, seed);
                break;
            }
            followingBreaks[breakPos] = 1;
            lastBreakPos = breakPos;
        }

        // Breaks found by calling preceding() from every index, walking from
        // the end of the text; the mirror image of the following() checks.
        memset(precedingBreaks, 0, sizeof(precedingBreaks));
        lastBreakPos = testText.length();
        precedingBreaks[testText.length()] = 1;
        for (i=testText.length(); i>0; i--) {
            breakPos = bi->preceding(i);
            if (breakPos >= i ||
                breakPos > lastBreakPos ||
                breakPos < 0 ||
                (breakPos < lastBreakPos && lastBreakPos < i)) {
                errln("%s break monkey test: "
                    "Out of range value returned by BreakIterator::preceding().\n"
                    "index=%d;  prev returned %d; lastBreak=%d" ,
                    name, i, breakPos, lastBreakPos);
                // 2 matches neither 0 nor 1, so the comparison below flags this index.
                precedingBreaks[i] = 2;
            } else {
                precedingBreaks[breakPos] = 1;
                lastBreakPos = breakPos;
            }
        }

        // Compare every recorded set against the expected breaks.
        for (i=0; i<=testText.length(); i++) {
            const char *errorType = NULL;
            if  (forwardBreaks[i] != expectedBreaks[i]) {
                errorType = "next()";
            } else if (reverseBreaks[i] != forwardBreaks[i]) {
                errorType = "previous()";
            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
                errorType = "isBoundary()";
            } else if (followingBreaks[i] != expectedBreaks[i]) {
                errorType = "following()";
            } else if (precedingBreaks[i] != expectedBreaks[i]) {
                errorType = "preceding()";
            }

            if (errorType != NULL) {
                // Choose a window of text around index i for the report.
                // Walk backwards from i, counting expected breaks, and stop
                // at the third one reached or at the start of the text.
                int      startContext = i;
                int32_t  count = 0;
                for (;;) {
                    if (startContext==0) { break; }
                    startContext --;
                    if (expectedBreaks[startContext] != 0) {
                        if (count == 2) break;
                        count ++;
                    }
                }

                // Walk forwards from i+1, using up the count gathered above at
                // each expected break, stopping at the end of the text or at
                // an expected break once the count is exhausted.  The scan is
                // run twice, exactly as before.
                int endContext = i + 1;
                int ci;
                for (ci=0; ci<2; ci++) {
                    for (;;) {
                        if (endContext >= testText.length()) {break;}
                        if (expectedBreaks[endContext-1] != 0) {
                            if (count == 0) break;
                            count --;
                        }
                        endContext ++;
                    }
                }

                // Render the window with \uxxxx / \Uxxxxxxxx escapes, marking
                // expected breaks with "<>" and the failing index with "<?>".
                UnicodeString errorText = "<data>";
                UnicodeString hexChars("0123456789abcdef");
                for (ci=startContext; ci<endContext;) {
                    UChar32  c;
                    int      bn;
                    c = testText.char32At(ci);
                    if (ci == i) {
                        errorText.append("<?>");
                    } else if (expectedBreaks[ci] != 0) {
                        errorText.append("<>");
                    }
                    if (c < 0x10000) {
                        errorText.append("\\u");
                        for (bn=12; bn>=0; bn-=4) {
                            errorText.append(hexChars.charAt((c>>bn)&0xf));
                        }
                    } else {
                        errorText.append("\\U");
                        for (bn=28; bn>=0; bn-=4) {
                            errorText.append(hexChars.charAt((c>>bn)&0xf));
                        }
                    }
                    ci = testText.moveIndex32(ci, 1);
                }
                errorText.append("<>");
                errorText.append("</data>\n");

                char        charErrorTxt[500];
                UErrorCode  status = U_ZERO_ERROR;
                errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
                charErrorTxt[sizeof(charErrorTxt)-1] = 0;
                errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
                    name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
                    errorType, seed, i, charErrorTxt);
                // Only the first mismatch of an iteration is reported.
                break;
            }
        }
        loopCount++;
    }
#endif
}
#endif