#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/uset.h"
#include "unicode/ustring.h"
#include "hash.h"
#include "unormimp.h"
#include "unicode/caniter.h"
#include "unicode/normlzr.h"
#include "unicode/uchar.h"
#include "cmemory.h"
// Debug-only helpers. Everything between this `#if 0` and its `#endif` is
// compiled out, so none of it is part of the built library.
#if 0
static UBool PROGRESS = FALSE;
#include <stdio.h>
#include "unicode/translit.h"
// File-scope error code; Tr() passes it to Transliterator::createInstance.
UErrorCode status = U_ZERO_ERROR;
// Extracts `source` into a function-local static char buffer, terminates it
// with 0 at the index returned by extract(), and returns that buffer.
// The returned pointer refers to static storage, so each call overwrites the
// result of the previous one.
// NOTE(review): no capacity is passed to extract(); a long string could
// presumably overrun the 256-byte buffer - confirm before re-enabling.
static const char* UToS(const UnicodeString &source) {
static char buffer[256];
buffer[source.extract(0, source.length(), buffer)] = 0;
return buffer;
}
// Copies `source` into a function-local static string, runs the "name"
// transliterator over the copy and returns a reference to it.
// The Transliterator is created once (static local) and never deleted here;
// the returned reference is overwritten by the next call.
static const UnicodeString &Tr(const UnicodeString &source) {
static Transliterator *NAME = Transliterator::createInstance("name", UTRANS_FORWARD, status);
static UnicodeString result;
result = source;
NAME->transliterate(result);
return result;
}
#endif
U_NAMESPACE_BEGIN
// Expands the UOBJECT_DEFINE_RTTI_IMPLEMENTATION macro (defined outside this
// file) for CanonicalIterator; presumably this supplies the class-ID based
// RTTI definitions for the class - confirm against the macro's header.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CanonicalIterator)
/**
 * Builds an iterator over the canonically equivalent forms of sourceStr.
 *
 * @param sourceStr string whose equivalents are to be enumerated
 * @param status    in/out error code; when it already holds a failure the
 *                  source is not set and the iterator is left empty
 */
CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode &status) :
    pieces(NULL),
    pieces_length(0),
    pieces_lengths(NULL),
    current(NULL),
    current_length(0)
{
    // Start out exhausted. setSource() clears the flag once it has accepted a
    // source; without this, `done` would be read uninitialized by next() when
    // status is already a failure or normalization fails inside setSource().
    done = TRUE;
    if(U_SUCCESS(status)) {
        setSource(sourceStr, status);
    }
}
/** Destructor: hands all segment storage back through cleanPieces(). */
CanonicalIterator::~CanonicalIterator()
{
    this->cleanPieces();
}
void CanonicalIterator::cleanPieces() {
int32_t i = 0;
if(pieces != NULL) {
for(i = 0; i < pieces_length; i++) {
if(pieces[i] != NULL) {
delete[] pieces[i];
}
}
uprv_free(pieces);
pieces = NULL;
if(pieces_lengths != NULL) {
uprv_free(pieces_lengths);
}
pieces_lengths = NULL;
if(current != NULL) {
uprv_free(current);
}
current = NULL;
}
}
/**
 * Returns a copy of the stored source string (the value setSource() wrote
 * into `source`, i.e. the normalized form of what the caller supplied).
 */
UnicodeString CanonicalIterator::getSource()
{
    return UnicodeString(source);
}
void CanonicalIterator::reset() {
done = FALSE;
for (int i = 0; i < current_length; ++i) {
current[i] = 0;
}
}
/**
 * Produces the next combination of per-segment equivalents.
 *
 * @return the concatenation of the currently selected variant of every
 *         segment, or a bogus string once all combinations were returned
 */
UnicodeString CanonicalIterator::next()
{
    if (done) {
        buffer.setToBogus();
        return buffer;
    }

    // Assemble the current combination, one selected variant per segment.
    buffer.remove();
    for (int32_t piece = 0; piece < pieces_length; ++piece) {
        buffer.append(pieces[piece][current[piece]]);
    }

    // Advance the indices like an odometer, rightmost segment first: bump a
    // slot, and if it wraps, zero it and carry into the slot to its left.
    int32_t slot = current_length - 1;
    while (slot >= 0) {
        if (++current[slot] < pieces_lengths[slot]) {
            break;
        }
        current[slot] = 0;
        --slot;
    }
    // Carrying past the leftmost slot means every combination was visited.
    if (slot < 0) {
        done = TRUE;
    }
    return buffer;
}
void CanonicalIterator::setSource(const UnicodeString &newSource, UErrorCode &status) {
Normalizer::normalize(newSource, UNORM_NFD, 0, source, status);
if(U_FAILURE(status)) {
return;
}
done = FALSE;
cleanPieces();
if (newSource.length() == 0) {
pieces_length = 1;
pieces = (UnicodeString **)uprv_malloc(sizeof(UnicodeString *));
if (pieces == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
current_length = 1;
current = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
if (current == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
uprv_free(pieces);
pieces = NULL;
return;
}
current[0] = 0;
pieces[0] = new UnicodeString[1];
if (pieces[0] == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
uprv_free(pieces);
pieces = NULL;
uprv_free(current);
return;
}
pieces[0][0] = UnicodeString();
pieces_lengths = (int32_t*)uprv_malloc(1 * sizeof(int32_t));
if (pieces_lengths == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
uprv_free(pieces);
pieces = NULL;
uprv_free(current);
return;
}
pieces_lengths[0] = 1;
return;
}
UnicodeString *list = new UnicodeString[source.length()];
if (list == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
int32_t list_length = 0;
UChar32 cp = 0;
int32_t start = 0;
int32_t i = UTF16_CHAR_LENGTH(source.char32At(0));
for (; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) {
cp = source.char32At(i);
if (unorm_isCanonSafeStart(cp)) {
source.extract(start, i-start, list[list_length++]); start = i;
}
}
source.extract(start, i-start, list[list_length++]);
pieces = (UnicodeString **)uprv_malloc(list_length * sizeof(UnicodeString *));
if (pieces == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
delete[] list;
return;
}
pieces_length = list_length;
pieces_lengths = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
if (pieces_lengths == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
delete[] list;
uprv_free(pieces);
pieces = NULL;
return;
}
current_length = list_length;
current = (int32_t*)uprv_malloc(list_length * sizeof(int32_t));
if (current == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
delete[] list;
uprv_free(pieces);
pieces = NULL;
uprv_free(pieces_lengths);
return;
}
for (i = 0; i < current_length; i++) {
current[i] = 0;
}
for (i = 0; i < pieces_length; ++i) {
pieces[i] = getEquivalents(list[i], pieces_lengths[i], status);
}
delete[] list;
}
/**
 * Adds the permutations of the code points of `source` to `result`.
 *
 * Each permutation is stored as both key and value of `result`; the value is
 * a heap-allocated UnicodeString handed over to the table.
 *
 * @param source    string whose code points are permuted (not modified here)
 * @param skipZeros when TRUE, a code point with combining class 0 is never
 *                  moved to the front unless it already is the first one
 * @param result    table receiving the permutations; must not be NULL
 * @param status    in/out error code; nothing is done if it already failed
 */
void U_EXPORT2 CanonicalIterator::permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status) {
    if(U_FAILURE(status)) {
        return;
    }

    // Zero or one code point: the string is its own only permutation.
    if (source.length() <= 2 && source.countChar32() <= 1) {
        UnicodeString *toPut = new UnicodeString(source);
        if (toPut == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        result->put(source, toPut, status);
        return;
    }

    // Scratch table for the permutations of "source minus one code point".
    // It lives on the stack so every exit path releases it, and it is only
    // used after its construction is known to have succeeded.
    Hashtable subpermute(status);
    if (U_FAILURE(status)) {
        return;
    }
    subpermute.setValueDeleter(uhash_deleteUnicodeString);

    UChar32 cp;
    for (int32_t i = 0; i < source.length(); i += UTF16_CHAR_LENGTH(cp)) {
        cp = source.char32At(i);

        // Optimization: a combining-class-0 character that is not already
        // first is never tried as the leading character.
        if (skipZeros && i != 0 && u_getCombiningClass(cp) == 0) {
            continue;
        }

        // Permute everything except the code point at i ...
        subpermute.removeAll();
        UnicodeString subPermuteString = source;
        permute(subPermuteString.replace(i, UTF16_CHAR_LENGTH(cp), NULL, 0), skipZeros, &subpermute, status);
        if(U_FAILURE(status)) {
            return;
        }

        // ... and put that code point in front of each sub-permutation.
        int32_t el = -1;
        const UHashElement *ne = subpermute.nextElement(el);
        while (ne != NULL) {
            UnicodeString *permRes = (UnicodeString *)(ne->value.pointer);
            UnicodeString *chStr = new UnicodeString(cp);
            if (chStr == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            chStr->append(*permRes);
            result->put(*chStr, chStr, status);
            ne = subpermute.nextElement(el);
        }
    }
}
/**
 * Computes all strings canonically equivalent to one segment.
 *
 * Candidates come from getEquivalents2(); every permutation of every
 * candidate is kept if its NFD form equals `segment`.
 *
 * @param segment    the (already NFD) segment
 * @param result_len out: number of strings in the returned array; 0 on failure
 * @param status     in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR when no
 *                   equivalent is found, U_MEMORY_ALLOCATION_ERROR on
 *                   allocation failure, and U_BUFFER_OVERFLOW_ERROR (by
 *                   UnicodeString::extract) when the segment does not fit the
 *                   256-unit work buffer.
 * @return array allocated with new[] that the caller must delete[], or NULL
 *         on failure
 */
UnicodeString* CanonicalIterator::getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status) {
    result_len = 0;
    if (U_FAILURE(status)) {
        return NULL;
    }

    // Both scratch tables live on the stack so every exit path releases them.
    Hashtable result(status);
    Hashtable permutations(status);
    if (U_FAILURE(status)) {
        return NULL;
    }
    result.setValueDeleter(uhash_deleteUnicodeString);
    permutations.setValueDeleter(uhash_deleteUnicodeString);

    UChar USeg[256];
    int32_t segLen = segment.extract(USeg, 256, status);
    if (U_FAILURE(status)) {
        // In particular a segment longer than USeg: continuing would make
        // getEquivalents2() read past the end of the buffer.
        return NULL;
    }

    Hashtable *basic = getEquivalents2(USeg, segLen, status);
    if (basic == NULL || U_FAILURE(status)) {
        delete basic;
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return NULL;
    }

    // Permute each candidate and keep the permutations that are really
    // canonically equivalent to the segment.
    int32_t basicPos = -1;
    const UHashElement *basicEntry = basic->nextElement(basicPos);
    while (basicEntry != NULL && U_SUCCESS(status)) {
        // permute() takes a non-const reference, so work on a copy.
        UnicodeString item = *((UnicodeString *)(basicEntry->value.pointer));
        permutations.removeAll();
        permute(item, CANITER_SKIP_ZEROES, &permutations, status);

        int32_t permPos = -1;
        const UHashElement *permEntry = permutations.nextElement(permPos);
        while (permEntry != NULL && U_SUCCESS(status)) {
            const UnicodeString &possible = *((UnicodeString *)(permEntry->value.pointer));
            UnicodeString attempt;
            Normalizer::normalize(possible, UNORM_NFD, 0, attempt, status);
            if (U_SUCCESS(status) && attempt == segment) {
                UnicodeString *kept = new UnicodeString(possible);
                if (kept == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }
                result.put(possible, kept, status);
            }
            permEntry = permutations.nextElement(permPos);
        }
        basicEntry = basic->nextElement(basicPos);
    }
    delete basic;

    if (U_FAILURE(status)) {
        return NULL;
    }

    int32_t resultCount = result.count();
    if (resultCount == 0) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    UnicodeString *finalResult = new UnicodeString[resultCount];
    if (finalResult == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    // Copy the accepted strings out of the table into the flat array.
    int32_t pos = -1;
    const UHashElement *entry = result.nextElement(pos);
    while (entry != NULL) {
        finalResult[result_len++] = *((UnicodeString *)(entry->value.pointer));
        entry = result.nextElement(pos);
    }
    return finalResult;
}
/**
 * Builds the set of candidate strings obtained from `segment` by replacing
 * material with characters whose canonical decomposition starts at one of
 * the segment's code points. The segment itself is always a member.
 *
 * @param segment UTF-16 text of the segment
 * @param segLen  number of UChars in segment
 * @param status  in/out error code
 * @return a new Hashtable (string -> owned UnicodeString copy) that the
 *         caller must delete, or NULL on failure
 */
Hashtable *CanonicalIterator::getEquivalents2(const UChar *segment, int32_t segLen, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    Hashtable *result = new Hashtable(status);
    if (result == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (U_FAILURE(status)) {
        delete result;
        return NULL;
    }
    result->setValueDeleter(uhash_deleteUnicodeString);

    // The segment is trivially equivalent to itself.
    UnicodeString toPut(segment, segLen);
    UnicodeString *selfCopy = new UnicodeString(toPut);
    if (selfCopy == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        delete result;
        return NULL;
    }
    result->put(toPut, selfCopy, status);

    USerializedSet starts;
    UChar32 cp = 0;
    for (int32_t i = 0; i < segLen && U_SUCCESS(status); i += UTF16_CHAR_LENGTH(cp)) {
        // Is the code point at i the start of some character's decomposition?
        UTF_GET_CHAR(segment, 0, i, segLen, cp);
        if (!unorm_getCanonStartSet(cp, &starts)) {
            continue;
        }

        // Try every character in the start set. The candidates use their own
        // variable: `cp` must keep the segment's code point because the outer
        // loop steps by its UTF-16 length.
        UChar32 rangeStart = 0;
        UChar32 rangeEnd = -1;
        for (int32_t rangeIndex = 0;
             U_SUCCESS(status) && uset_getSerializedRange(&starts, rangeIndex, &rangeStart, &rangeEnd);
             ++rangeIndex) {
            for (UChar32 candidate = rangeStart; candidate <= rangeEnd && U_SUCCESS(status); ++candidate) {
                Hashtable *remainder = extract(candidate, segment, segLen, i, status);
                if (remainder == NULL) {
                    // No match, or a failure that the loop conditions pick up.
                    continue;
                }
                if (U_FAILURE(status)) {
                    delete remainder;
                    break;
                }

                // Every result is: text before i + candidate + one remainder.
                UnicodeString prefix(segment, i);
                prefix += candidate;

                int32_t pos = -1;
                const UHashElement *entry = remainder->nextElement(pos);
                while (entry != NULL) {
                    const UnicodeString &tail = *((UnicodeString *)(entry->value.pointer));
                    UnicodeString *toAdd = new UnicodeString(prefix);
                    if (toAdd == NULL) {
                        status = U_MEMORY_ALLOCATION_ERROR;
                        break;
                    }
                    *toAdd += tail;
                    result->put(*toAdd, toAdd, status);
                    entry = remainder->nextElement(pos);
                }
                delete remainder;
            }
        }
    }

    if(U_FAILURE(status)) {
        // Do not leak the partially filled table.
        delete result;
        return NULL;
    }
    return result;
}
/**
 * Checks whether the canonical decomposition of `comp` can be found, in
 * order, within segment[segmentPos..segLen), and if so returns the
 * equivalents of what is left over once those characters are removed.
 *
 * @param comp       candidate character
 * @param segment    UTF-16 text of the segment
 * @param segLen     number of UChars in segment
 * @param segmentPos index in segment where matching starts
 * @param status     in/out error code; set to U_BUFFER_OVERFLOW_ERROR when a
 *                   match needs more room than the 256-unit work buffer
 * @return a new Hashtable the caller must delete, or NULL when `comp` does
 *         not match or an error occurred
 */
Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }

    const int32_t bufSize = 256;
    UChar temp[bufSize];        // comp followed by the leftover characters
    int32_t inputLen = 0;
    U16_APPEND_UNSAFE(temp, inputLen, comp);

    UChar stackBuffer[4];
    int32_t decompLen;
    const UChar *decomp = unorm_getCanonicalDecomposition(comp, stackBuffer, &decompLen);
    if(decomp == NULL) {
        // No decomposition available: match against comp itself.
        stackBuffer[0] = temp[0];
        if(inputLen > 1) {
            stackBuffer[1] = temp[1];
        }
        decomp = stackBuffer;
        decompLen = inputLen;
    }

    // The leftover text is collected right behind comp, so only the rest of
    // temp is available for it.
    UChar *buff = temp + inputLen;
    const int32_t buffCapacity = bufSize - inputLen;
    int32_t bufLen = 0;

    UBool ok = FALSE;          // whole decomposition was found
    UBool overflowed = FALSE;  // leftover text did not fit into buff
    int32_t decompPos = 0;
    UChar32 decompCp;
    UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);

    int32_t i = segmentPos;
    while(i < segLen) {
        UChar32 cp;
        UTF_NEXT_CHAR(segment, i, segLen, cp);

        if (cp == decompCp) {
            if (decompPos == decompLen) {
                // Matched the last decomposition character: everything after
                // it is leftover text.
                int32_t restLen = segLen - i;
                if (overflowed || restLen > buffCapacity - bufLen) {
                    overflowed = TRUE;
                } else {
                    uprv_memcpy(buff+bufLen, segment+i, restLen*sizeof(UChar));
                    bufLen += restLen;
                }
                ok = TRUE;
                break;
            }
            UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
        } else if (!overflowed) {
            // Unmatched character: it belongs to the leftover text.
            UBool appendError = FALSE;
            U16_APPEND(buff, bufLen, buffCapacity, cp, appendError);
            if(appendError) {
                // Keep scanning; the overflow only matters if comp matches.
                overflowed = TRUE;
            }
        }
    }
    if (!ok) {
        return NULL; // the decomposition was not found in the segment
    }
    if (overflowed) {
        // Report the truncation instead of overrunning temp or silently
        // working with an incomplete remainder.
        status = U_BUFFER_OVERFLOW_ERROR;
        return NULL;
    }

    if (bufLen == 0) {
        // Nothing left over: the only remainder is the empty string.
        Hashtable *result = new Hashtable(status);
        if (result == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        if (U_FAILURE(status)) {
            delete result;
            return NULL;
        }
        result->setValueDeleter(uhash_deleteUnicodeString);
        UnicodeString *emptyValue = new UnicodeString();
        if (emptyValue == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            delete result;
            return NULL;
        }
        result->put(UnicodeString(), emptyValue, status);
        if (U_FAILURE(status)) {
            delete result;
            return NULL;
        }
        return result;
    }

    // Verify that comp + leftover really decomposes to the original text.
    // The decomposed length must agree as well; otherwise the comparison
    // would read unwritten parts of trial or accept a mere prefix match.
    int32_t tempLen = inputLen + bufLen;
    UChar trial[bufSize];
    int32_t trialLen = unorm_decompose(trial, bufSize, temp, tempLen, FALSE, 0, &status);
    if(U_FAILURE(status)) {
        return NULL;
    }
    int32_t expectedLen = segLen - segmentPos;
    if(trialLen != expectedLen
        || uprv_memcmp(segment+segmentPos, trial, expectedLen*sizeof(UChar)) != 0) {
        return NULL;
    }

    return getEquivalents2(buff, bufLen, status);
}
U_NAMESPACE_END
#endif