norms.cpp   [plain text]


// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// norms.cpp
// created: 2017jun04 Markus W. Scherer
// (pulled out of n2builder.cpp)

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#include <stdio.h>
#include <stdlib.h>
#include "unicode/errorcode.h"
#include "unicode/unistr.h"
#include "unicode/utf16.h"
#include "normalizer2impl.h"
#include "norms.h"
#include "toolutil.h"
#include "utrie2.h"
#include "uvectr32.h"

U_NAMESPACE_BEGIN

void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
    if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
        if(cc==0) {
            fLastStarterIndex=fLength;
        }
        fArray[fLength++]=(c<<8)|cc;
        return;
    }
    // Let this character bubble back to its canonical order.
    int32_t i=fLength-1;
    while(i>fLastStarterIndex && ccAt(i)>cc) {
        --i;
    }
    ++i;  // after the last starter or prevCC<=cc
    // Move this and the following characters forward one to make space.
    for(int32_t j=fLength; i<j; --j) {
        fArray[j]=fArray[j-1];
    }
    fArray[i]=(c<<8)|cc;
    ++fLength;
    fDidReorder=TRUE;
}

void BuilderReorderingBuffer::toString(UnicodeString &dest) const {
    dest.remove();
    for(int32_t i=0; i<fLength; ++i) {
        dest.append(charAt(i));
    }
}

UChar32 Norm::combine(UChar32 trail) const {
    int32_t length;
    const CompositionPair *pairs=getCompositionPairs(length);
    for(int32_t i=0; i<length; ++i) {
        if(trail==pairs[i].trail) {
            return pairs[i].composite;
        }
        if(trail<pairs[i].trail) {
            break;
        }
    }
    return U_SENTINEL;
}

Norms::Norms(UErrorCode &errorCode) {
    normTrie=utrie2_open(0, 0, &errorCode);
    normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
    // Default "inert" Norm struct at index 0. Practically immutable.
    norms=allocNorm();
    norms->type=Norm::INERT;
}

Norms::~Norms() {
    utrie2_close(normTrie);
    int32_t normsLength=utm_countItems(normMem);
    for(int32_t i=1; i<normsLength; ++i) {
        delete norms[i].mapping;
        delete norms[i].rawMapping;
        delete norms[i].compositions;
    }
    utm_close(normMem);
}

Norm *Norms::allocNorm() {
    Norm *p=(Norm *)utm_alloc(normMem);
    norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
    return p;
}

Norm *Norms::getNorm(UChar32 c) {
    uint32_t i=utrie2_get32(normTrie, c);
    if(i==0) {
        return nullptr;
    }
    return norms+i;
}

const Norm *Norms::getNorm(UChar32 c) const {
    uint32_t i=utrie2_get32(normTrie, c);
    if(i==0) {
        return nullptr;
    }
    return norms+i;
}

const Norm &Norms::getNormRef(UChar32 c) const {
    return norms[utrie2_get32(normTrie, c)];
}

Norm *Norms::createNorm(UChar32 c) {
    uint32_t i=utrie2_get32(normTrie, c);
    if(i!=0) {
        return norms+i;
    } else {
        /* allocate Norm */
        Norm *p=allocNorm();
        IcuToolErrorCode errorCode("gennorm2/createNorm()");
        utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
        return p;
    }
}

void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
    int32_t length=mapping.length();
    U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK);
    const char16_t *s=mapping.getBuffer();
    int32_t i=0;
    UChar32 c;
    while(i<length) {
        U16_NEXT(s, i, length, c);
        buffer.append(c, getCC(c));
    }
    if(buffer.didReorder()) {
        buffer.toString(mapping);
    }
}

UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const {
    if((highCC-lowCC)>=2) {
        int32_t length;
        const CompositionPair *pairs=norm.getCompositionPairs(length);
        for(int32_t i=0; i<length; ++i) {
            uint8_t trailCC=getCC(pairs[i].trail);
            if(lowCC<trailCC && trailCC<highCC) {
                return TRUE;
            }
        }
    }
    return FALSE;
}

U_CDECL_BEGIN

static UBool U_CALLCONV
enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    return ((Norms::Enumerator *)context)->rangeHandler(start, end, value);
}

U_CDECL_END

void Norms::enumRanges(Enumerator &e) {
    utrie2_enum(normTrie, nullptr, enumRangeHandler, &e);
}

Norms::Enumerator::~Enumerator() {}

UBool Norms::Enumerator::rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
    if(value!=0) {
        rangeHandler(start, end, norms.getNormRefByIndex(value));
    }
    return TRUE;
}

void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
    if(norm.mappingType!=Norm::ROUND_TRIP) { return; }
    if(start!=end) {
        fprintf(stderr,
                "gennorm2 error: same round-trip mapping for "
                "more than 1 code point U+%04lX..U+%04lX\n",
                (long)start, (long)end);
        exit(U_INVALID_FORMAT_ERROR);
    }
    if(norm.cc!=0) {
        fprintf(stderr,
                "gennorm2 error: "
                "U+%04lX has a round-trip mapping and ccc!=0, "
                "not possible in Unicode normalization\n",
                (long)start);
        exit(U_INVALID_FORMAT_ERROR);
    }
    // setRoundTripMapping() ensured that there are exactly two code points.
    const UnicodeString &m=*norm.mapping;
    UChar32 lead=m.char32At(0);
    UChar32 trail=m.char32At(m.length()-1);
    if(norms.getCC(lead)!=0) {
        fprintf(stderr,
                "gennorm2 error: "
                "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
                "not possible in Unicode normalization\n",
                (long)start, (long)lead);
        exit(U_INVALID_FORMAT_ERROR);
    }
    // Flag for trailing character.
    norms.createNorm(trail)->combinesBack=TRUE;
    // Insert (trail, composite) pair into compositions list for the lead character.
    IcuToolErrorCode errorCode("gennorm2/addComposition()");
    Norm *leadNorm=norms.createNorm(lead);
    UVector32 *compositions=leadNorm->compositions;
    int32_t i;
    if(compositions==nullptr) {
        compositions=leadNorm->compositions=new UVector32(errorCode);
        i=0;  // "insert" the first pair at index 0
    } else {
        // Insertion sort, and check for duplicate trail characters.
        int32_t length;
        const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
        for(i=0; i<length; ++i) {
            if(trail==pairs[i].trail) {
                fprintf(stderr,
                        "gennorm2 error: same round-trip mapping for "
                        "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
                        (long)start, (long)lead, (long)trail);
                exit(U_INVALID_FORMAT_ERROR);
            }
            if(trail<pairs[i].trail) {
                break;
            }
        }
    }
    compositions->insertElementAt(trail, 2*i, errorCode);
    compositions->insertElementAt(start, 2*i+1, errorCode);
}

void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
    if(!norm.hasMapping()) { return; }
    const UnicodeString &m=*norm.mapping;
    UnicodeString *decomposed=nullptr;
    const UChar *s=toUCharPtr(m.getBuffer());
    int32_t length=m.length();
    int32_t prev, i=0;
    UChar32 c;
    while(i<length) {
        prev=i;
        U16_NEXT(s, i, length, c);
        if(start<=c && c<=end) {
            fprintf(stderr,
                    "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
                    (long)c);
            exit(U_INVALID_FORMAT_ERROR);
        }
        const Norm &cNorm=norms.getNormRef(c);
        if(cNorm.hasMapping()) {
            if(norm.mappingType==Norm::ROUND_TRIP) {
                if(prev==0) {
                    if(cNorm.mappingType!=Norm::ROUND_TRIP) {
                        fprintf(stderr,
                                "gennorm2 error: "
                                "U+%04lX's round-trip mapping's starter "
                                "U+%04lX one-way-decomposes, "
                                "not possible in Unicode normalization\n",
                                (long)start, (long)c);
                        exit(U_INVALID_FORMAT_ERROR);
                    }
                    uint8_t myTrailCC=norms.getCC(m.char32At(i));
                    UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
                    uint8_t cTrailCC=norms.getCC(cTrailChar);
                    if(cTrailCC>myTrailCC) {
                        fprintf(stderr,
                                "gennorm2 error: "
                                "U+%04lX's round-trip mapping's starter "
                                "U+%04lX decomposes and the "
                                "inner/earlier tccc=%hu > outer/following tccc=%hu, "
                                "not possible in Unicode normalization\n",
                                (long)start, (long)c,
                                (short)cTrailCC, (short)myTrailCC);
                        exit(U_INVALID_FORMAT_ERROR);
                    }
                } else {
                    fprintf(stderr,
                            "gennorm2 error: "
                            "U+%04lX's round-trip mapping's non-starter "
                            "U+%04lX decomposes, "
                            "not possible in Unicode normalization\n",
                            (long)start, (long)c);
                    exit(U_INVALID_FORMAT_ERROR);
                }
            }
            if(decomposed==nullptr) {
                decomposed=new UnicodeString(m, 0, prev);
            }
            decomposed->append(*cNorm.mapping);
        } else if(Hangul::isHangul(c)) {
            UChar buffer[3];
            int32_t hangulLength=Hangul::decompose(c, buffer);
            if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
                fprintf(stderr,
                        "gennorm2 error: "
                        "U+%04lX's round-trip mapping's non-starter "
                        "U+%04lX decomposes, "
                        "not possible in Unicode normalization\n",
                        (long)start, (long)c);
                exit(U_INVALID_FORMAT_ERROR);
            }
            if(decomposed==nullptr) {
                decomposed=new UnicodeString(m, 0, prev);
            }
            decomposed->append(buffer, hangulLength);
        } else if(decomposed!=nullptr) {
            decomposed->append(m, prev, i-prev);
        }
    }
    if(decomposed!=nullptr) {
        if(norm.rawMapping==nullptr) {
            // Remember the original mapping when decomposing recursively.
            norm.rawMapping=norm.mapping;
        } else {
            delete norm.mapping;
        }
        norm.mapping=decomposed;
        // Not  norm.setMappingCP();  because the original mapping
        // is most likely to be encodable as a delta.
        didDecompose|=TRUE;
    }
}

U_NAMESPACE_END

#endif // #if !UCONFIG_NO_NORMALIZATION