#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/uset.h"
#include "unicode/ustring.h"
#include "hash.h"
#include "unormimp.h"
#include "unicode/caniter.h"
#include "unicode/normlzr.h"
#include "unicode/uchar.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
// Expands ICU's UObject RTTI boilerplate for CanonicalIterator.
// NOTE(review): the macro body is not visible here; presumably it defines the
// class-ID accessors declared in the class header -- confirm against uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)
/**
 * Builds an iterator over the canonically equivalent forms of sourceStr.
 *
 * @param sourceStr string to iterate over; handed to setSource().
 * @param status    in/out ICU error code. If it already signals failure on
 *                  entry, setSource() is not called and the iterator is left
 *                  empty and exhausted.
 */
CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) :
    pieces(NULL),
    pieces_length(0),
    pieces_lengths(NULL),
    current(NULL),
    current_length(0)
{
    // `done` is otherwise only written by setSource()/reset()/next(). Give it
    // a defined value so that next() never reads an indeterminate flag when
    // construction fails before setSource() reaches its `done = FALSE`.
    // (Assigned in the body rather than the initializer list so the member
    // declaration order in the header does not matter.)
    done = TRUE;

    if(U_SUCCESS(status)) {
        setSource(sourceStr, status);
    }
}
// Destructor: hands all heap buffers owned by the iterator to cleanPieces().
CanonicalIterator::~CanonicalIterator() {
    this->cleanPieces();
}
void CanonicalIterator::cleanPieces() {
int32_t i = 0;
if(pieces != NULL) {
for(i = 0; i < pieces_length; i++) {
if(pieces[i] != NULL) {
delete[] pieces[i];
}
}
uprv_free(pieces);
pieces = NULL;
pieces_length = 0;
}
if(pieces_lengths != NULL) {
uprv_free(pieces_lengths);
pieces_lengths = NULL;
}
if(current != NULL) {
uprv_free(current);
current = NULL;
current_length = 0;
}
}
// Returns a copy of the stored source string, i.e. the NFD form that
// setSource() wrote into `source`.
UnicodeString CanonicalIterator::getSource() {
    return this->source;
}
void CanonicalIterator::reset() {
done = FALSE;
for (int i = 0; i < current_length; ++i) {
current[i] = 0;
}
}
/**
 * Produces the next combination of per-segment equivalents.
 *
 * @return the concatenation of pieces[k][current[k]] for every segment k, or
 *         a bogus string once the iteration has been exhausted.
 */
UnicodeString CanonicalIterator::next() {
    if (done) {
        buffer.setToBogus();
        return buffer;
    }

    // Assemble the result for the current index vector.
    buffer.remove();
    for (int32_t seg = 0; seg < pieces_length; ++seg) {
        buffer.append(pieces[seg][current[seg]]);
    }

    // Advance the index vector like an odometer, rightmost position first:
    // bump a position; if it is still in range we are finished, otherwise
    // wrap it to zero and carry into the position on its left.
    for (int32_t pos = current_length - 1; pos >= 0; --pos) {
        current[pos]++;
        if (current[pos] < pieces_lengths[pos]) {
            return buffer;
        }
        current[pos] = 0;
    }

    // The carry ran off the left end: this was the last combination.
    done = TRUE;
    return buffer;
}
void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) {
int32_t list_length = 0;
UChar32 cp = 0;
int32_t start = 0;
int32_t i = 0;
UnicodeString *list = NULL;
Normalizer::normalize(newSource, UNORM_NFD, 0, source, status);
if(U_FAILURE(status)) {
return;
}
done = FALSE;
cleanPieces();
if (newSource.length() == 0) {
pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *));
pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
pieces_length = 1;
current = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
current_length = 1;
if (pieces == NULL || pieces_lengths == NULL || current == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
goto CleanPartialInitialization;
}
current[0] = 0;
pieces[0] = new UnicodeString[1];
pieces_lengths[0] = 1;
if (pieces[0] == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
goto CleanPartialInitialization;
}
return;
}
list = new UnicodeString[source.length()];
if (list == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
goto CleanPartialInitialization;
}
i = UTF16_CHAR_LENGTH(source.char32At(0));
for (; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) {
cp = source.char32At(i);
if (unorm_isCanonSafeStart(cp)) {
source.extract(start, i-start, list[list_length++]); start = i;
}
}
source.extract(start, i-start, list[list_length++]);
pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *));
pieces_length = list_length;
pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
current_length = list_length;
if (pieces == NULL || pieces_lengths == NULL || current == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
goto CleanPartialInitialization;
}
for (i = 0; i < current_length; i++) {
current[i] = 0;
}
for (i = 0; i < pieces_length; ++i) {
pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);
}
delete[] list;
return;
CleanPartialInitialization:
if (list != NULL) {
delete[] list;
}
cleanPieces();
}
/**
 * Recursively collects the permutations of the code points of `source` into
 * `result`, keyed and valued by the permuted string.
 *
 * @param source    string whose code points are permuted.
 * @param skipZeros when true, a code point that is not at offset 0 and whose
 *                  combining class is 0 is never moved to the front.
 * @param result    table receiving heap-allocated UnicodeString values.
 * @param status    in/out ICU error code; the function is a no-op when it
 *                  already signals failure.
 */
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    // Base case: at most one code point -- the string is its own only
    // permutation.
    if (source.length() <= 2 && source.countChar32() <= 1) {
        UnicodeString *soleEntry = new UnicodeString(source);
        if (soleEntry != 0) {
            result->put(source, soleEntry, status);
        } else {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return;
    }

    // Scratch table reused for the permutations of "source minus one code point".
    Hashtable subpermute(status);
    if (U_FAILURE(status)) {
        return;
    }
    subpermute.setValueDeleter(uhash_deleteUnicodeString);

    int32_t pos = 0;
    while (pos < source.length()) {
        const UChar32 lead = source.char32At(pos);
        const int32_t leadUnits = UTF16_CHAR_LENGTH(lead);
        UnicodeString remainder = source;

        const UBool movable = !(skipZeros && pos != 0 && u_getCombiningClass(lead) == 0);
        if (movable) {
            // Permute everything except `lead` ...
            subpermute.removeAll();
            permute(remainder.replace(pos, leadUnits, NULL, 0), skipZeros, &subpermute, status);
            if (U_FAILURE(status)) {
                return;
            }

            // ... and prefix each of those permutations with `lead`.
            int32_t cursor = -1;
            for (const UHashElement *entry = subpermute.nextElement(cursor);
                 entry != NULL;
                 entry = subpermute.nextElement(cursor)) {
                UnicodeString *tail = (UnicodeString *)(entry->value.pointer);
                UnicodeString *combined = new UnicodeString(lead);
                if (combined == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return;
                }
                combined->append(*tail);
                result->put(*combined, combined, status);
            }
        }

        pos += leadUnits;
    }
}
/**
 * Computes the strings that are canonically equivalent to one segment.
 *
 * Candidate strings come from getEquivalents2(); every permutation of each
 * candidate (see permute()) is kept only if its NFD form equals `segment`.
 *
 * @param segment    the segment to expand.
 * @param result_len receives the number of strings in the returned array; it
 *                   is 0 whenever NULL is returned.
 * @param status     in/out ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR
 *                   when no equivalent was found and to
 *                   U_MEMORY_ALLOCATION_ERROR when an allocation fails.
 * @return a new[]-allocated array of result_len strings owned by the caller,
 *         or NULL on failure.
 */
UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {
    // Give the out-parameter a defined value on every exit path.
    result_len = 0;

    Hashtable result(status);
    Hashtable permutations(status);
    Hashtable basic(status);
    if (U_FAILURE(status)) {
        return 0;
    }
    result.setValueDeleter(uhash_deleteUnicodeString);
    permutations.setValueDeleter(uhash_deleteUnicodeString);
    basic.setValueDeleter(uhash_deleteUnicodeString);

    UChar USeg[256];
    int32_t segLen = segment.extract(USeg, 256, status);
    getEquivalents2(&basic, USeg, segLen, status);

    // Now get all the permutations of each candidate and keep those whose
    // decomposition matches the segment.
    const UHashElement *ne = NULL;
    int32_t el = -1;
    ne = basic.nextElement(el);
    while (ne != NULL) {
        // Copied because permute() takes a non-const reference.
        UnicodeString item = *((UnicodeString *)(ne->value.pointer));

        permutations.removeAll();
        permute(item, CANITER_SKIP_ZEROES, &permutations, status);

        const UHashElement *ne2 = NULL;
        int32_t el2 = -1;
        ne2 = permutations.nextElement(el2);
        while (ne2 != NULL) {
            UnicodeString possible(*((UnicodeString *)(ne2->value.pointer)));
            UnicodeString attempt;
            Normalizer::normalize(possible, UNORM_NFD, 0, attempt, status);

            if (attempt==segment) {
                // Check the allocation like the rest of this file does,
                // instead of handing a NULL value to the table.
                UnicodeString *accepted = new UnicodeString(possible);
                if (accepted == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return NULL;
                }
                result.put(possible, accepted, status);
            }

            ne2 = permutations.nextElement(el2);
        }
        ne = basic.nextElement(el);
    }

    if(U_FAILURE(status)) {
        return 0;
    }

    // Convert the table into a plain array for the caller.
    int32_t resultCount = result.count();
    if (resultCount == 0) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    UnicodeString *finalResult = new UnicodeString[resultCount];
    if (finalResult == 0) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    el = -1;
    ne = result.nextElement(el);
    while(ne != NULL) {
        finalResult[result_len++] = *((UnicodeString *)(ne->value.pointer));
        ne = result.nextElement(el);
    }

    return finalResult;
}
/**
 * Adds to fillinResult the segment itself plus every string obtained by
 * replacing, at some position i, the text from i onward with a code point
 * `cp` taken from the canonical start set of the code point at i, followed
 * by one of the remainders that extract() reports for that `cp`.
 *
 * @param fillinResult table receiving heap-allocated UnicodeString values.
 * @param segment      UTF-16 text of the segment.
 * @param segLen       number of UChars in segment.
 * @param status       in/out ICU error code.
 * @return fillinResult, or NULL if status signals failure.
 */
Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }

    // The segment is always one of its own equivalents.
    UnicodeString toPut(segment, segLen);
    UnicodeString *selfCopy = new UnicodeString(toPut);
    if (selfCopy == NULL) {
        // Checked like the other allocations in this file rather than
        // handing a NULL value to the table.
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    fillinResult->put(toPut, selfCopy, status);

    USerializedSet starts;

    // cp doubles as the current code point of the segment (outer loop) and as
    // the current member of the start set (inner loop).
    UChar32 cp, end = 0;
    int32_t i = 0, j;
    for (i = 0; i < segLen; i += UTF16_CHAR_LENGTH(cp)) {
        // See if any character is at the start of some decomposition.
        UTF_GET_CHAR(segment, 0, i, segLen, cp);
        if (!unorm_getCanonStartSet(cp, &starts)) {
            continue;
        }

        // Walk every code point of every range in the set. Starting with
        // cp = end+1 makes `cp <= end` false so the first range is fetched;
        // afterwards a new range is fetched only when cp runs past `end`.
        // NOTE(review): assumes uset_getSerializedRange() stores the range
        // bounds in cp/end and returns false once j is past the last range.
        for(j = 0, cp = end+1; cp <= end || uset_getSerializedRange(&starts, j++, &cp, &end); ++cp) {
            Hashtable remainder(status);
            remainder.setValueDeleter(uhash_deleteUnicodeString);
            if (extract(&remainder, cp, segment, segLen, i, status) == NULL) {
                continue;
            }

            // There were some matches, so add all the possibilities.
            UnicodeString prefix(segment, i);
            prefix += cp;

            int32_t el = -1;
            const UHashElement *ne = remainder.nextElement(el);
            while (ne != NULL) {
                UnicodeString item = *((UnicodeString *)(ne->value.pointer));
                UnicodeString *toAdd = new UnicodeString(prefix);
                if (toAdd == 0) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return NULL;
                }
                *toAdd += item;
                fillinResult->put(*toAdd, toAdd, status);

                ne = remainder.nextElement(el);
            }
        }
    }

    if(U_FAILURE(status)) {
        return NULL;
    }
    return fillinResult;
}
/**
 * Tests whether the canonical decomposition of `comp` can be "pulled out" of
 * segment[segmentPos..segLen), and if so adds the equivalents of what is left
 * over to fillinResult.
 *
 * The decomposition's code points are matched in order against the segment;
 * code points that do not match are collected as the remainder. The
 * candidate `comp + remainder` is accepted only when its canonical
 * decomposition is identical to the original tail of the segment.
 *
 * @param fillinResult table receiving the remainder equivalents.
 * @param comp         candidate composite code point.
 * @param segment      UTF-16 text of the segment.
 * @param segLen       number of UChars in segment.
 * @param segmentPos   offset in segment at which matching starts.
 * @param status       in/out ICU error code; set to U_BUFFER_OVERFLOW_ERROR
 *                     if the remainder does not fit the internal buffer.
 * @return fillinResult (or whatever getEquivalents2() returns for the
 *         remainder) on success, NULL if `comp` does not fit or on error.
 */
Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }

    const int32_t bufSize = 256;
    // temp holds `comp` (at most 2 UTF-16 units) followed by the remainder.
    // The 2 extra units keep a full bufSize units available for the
    // remainder; previously the remainder was bounds-checked against bufSize
    // although it starts inputLen units into the array.
    const int32_t tempCapacity = bufSize + 2;
    int32_t bufLen = 0;
    UChar temp[tempCapacity];

    int32_t inputLen = 0, decompLen;
    UChar stackBuffer[4];
    const UChar *decomp;

    U16_APPEND_UNSAFE(temp, inputLen, comp);
    decomp = unorm_getCanonicalDecomposition(comp, stackBuffer, &decompLen);
    if(decomp == NULL) {
        // No decomposition available: match against comp itself.
        stackBuffer[0] = temp[0];
        if(inputLen > 1) {
            stackBuffer[1] = temp[1];
        }
        decomp = stackBuffer;
        decompLen = inputLen;
    }

    UChar *buff = temp+inputLen;
    const int32_t buffCapacity = tempCapacity - inputLen;

    // See if the decomposition of comp is contained in the segment.
    UBool ok = FALSE;
    UChar32 cp;
    int32_t decompPos = 0;
    UChar32 decompCp;
    UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);

    int32_t i;
    UBool overflow = FALSE;

    i = segmentPos;
    while(i < segLen) {
        UTF_NEXT_CHAR(segment, i, segLen, cp);

        if (cp == decompCp) { // matched a decomposition code point
            if (decompPos == decompLen) { // decomposition fully matched
                // Everything after the last match belongs to the remainder.
                const int32_t tailLen = segLen - i;
                if (tailLen > buffCapacity - bufLen) {
                    status = U_BUFFER_OVERFLOW_ERROR;
                    return NULL;
                }
                uprv_memcpy(buff+bufLen, segment+i, tailLen*sizeof(UChar));
                bufLen += tailLen;
                ok = TRUE;
                break;
            }
            UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
        } else {
            // Not part of the decomposition: keep it in the remainder.
            U16_APPEND(buff, bufLen, buffCapacity, cp, overflow);
            if(overflow) {
                // Report the truncation instead of silently dropping cp.
                status = U_BUFFER_OVERFLOW_ERROR;
                return NULL;
            }
        }
    }
    if (!ok) {
        return NULL; // the decomposition was not fully contained
    }

    if (bufLen == 0) {
        // Nothing left over: the only remainder is the empty string.
        UnicodeString *emptyRemainder = new UnicodeString();
        if (emptyRemainder == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        fillinResult->put(UnicodeString(), emptyRemainder, status);
        return fillinResult;
    }

    // Brute-force check: comp + remainder must decompose to exactly the
    // original tail of the segment (same length and same contents).
    int32_t tempLen = inputLen + bufLen;

    UChar trial[bufSize];
    int32_t trialLen = unorm_decompose(trial, bufSize, temp, tempLen, FALSE, 0, &status);

    if(U_FAILURE(status)
        || trialLen != (segLen - segmentPos)
        || uprv_memcmp(segment+segmentPos, trial, (segLen - segmentPos)*sizeof(UChar)) != 0)
    {
        return NULL;
    }

    return getEquivalents2(fillinResult, buff, bufLen, status);
}
U_NAMESPACE_END
#endif