extradata.cpp   [plain text]


// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// extradata.cpp
// created: 2017jun04 Markus W. Scherer
// (pulled out of n2builder.cpp)

#include "unicode/utypes.h"

#if !UCONFIG_NO_NORMALIZATION

#include <stdio.h>
#include <stdlib.h>
#include "unicode/errorcode.h"
#include "unicode/unistr.h"
#include "unicode/utf16.h"
#include "extradata.h"
#include "normalizer2impl.h"
#include "norms.h"
#include "toolutil.h"
#include "utrie2.h"
#include "uvectr32.h"

U_NAMESPACE_BEGIN

ExtraData::ExtraData(Norms &n, UBool fast) :
        Norms::Enumerator(n),
        yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
        yesNoMappingsAndCompositions(1000, (UChar32)0, 1),  // 0=Hangul LV, 1=start of normal data
        yesNoMappingsOnly(1000, (UChar32)0, 1),  // 0=Hangul LVT, 1=start of normal data
        optimizeFast(fast) {
    // Hangul LV algorithmically decomposes to two Jamo.
    // Some code may harmlessly read this firstUnit.
    yesNoMappingsAndCompositions.setCharAt(0, 2);
    // Hangul LVT algorithmically decomposes to three Jamo.
    // Some code may harmlessly read this firstUnit.
    yesNoMappingsOnly.setCharAt(0, 3);
}

int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
    UnicodeString &m=*norm.mapping;
    int32_t length=m.length();
    // Write the mapping & raw mapping extraData.
    int32_t firstUnit=length|(norm.trailCC<<8);
    int32_t preMappingLength=0;
    if(norm.rawMapping!=NULL) {
        UnicodeString &rm=*norm.rawMapping;
        int32_t rmLength=rm.length();
        if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
            fprintf(stderr,
                    "gennorm2 error: "
                    "raw mapping for U+%04lX longer than maximum of %d\n",
                    (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
            exit(U_INVALID_FORMAT_ERROR);
        }
        UChar rm0=rm.charAt(0);
        if( rmLength==length-1 &&
            // 99: overlong substring lengths get pinned to remainder lengths anyway
            0==rm.compare(1, 99, m, 2, 99) &&
            rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
        ) {
            // Compression:
            // rawMapping=rm0+mapping.substring(2) -> store only rm0
            //
            // The raw mapping is the same as the final mapping after replacing
            // the final mapping's first two code units with the raw mapping's first one.
            // In this case, we store only that first unit, rm0.
            // This helps with a few hundred mappings.
            dataString.append(rm0);
            preMappingLength=1;
        } else {
            // Store the raw mapping with its length.
            dataString.append(rm);
            dataString.append((UChar)rmLength);
            preMappingLength=rmLength+1;
        }
        firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
    }
    int32_t cccLccc=norm.cc|(norm.leadCC<<8);
    if(cccLccc!=0) {
        dataString.append((UChar)cccLccc);
        ++preMappingLength;
        firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
    }
    dataString.append((UChar)firstUnit);
    dataString.append(m);
    return preMappingLength;
}

int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm,
                                    UnicodeString &dataString,
                                    Hashtable &previousMappings) {
    UnicodeString newMapping;
    int32_t offset=writeMapping(c, norm, newMapping);
    int32_t previousOffset=previousMappings.geti(newMapping);
    if(previousOffset!=0) {
        // Duplicate, point to the identical mapping that has already been stored.
        offset=previousOffset-1;
    } else {
        // Append this new mapping and
        // enter it into the hashtable, avoiding value 0 which is "not found".
        offset=dataString.length()+offset;
        dataString.append(newMapping);
        IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
        previousMappings.puti(newMapping, offset+1, errorCode);
    }
    return offset;
}

UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const {
    // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point.
    // Do not map from ASCII to non-ASCII.
    if(norm.mappingCP>=0 &&
            !(c<=0x7f && norm.mappingCP>0x7f) &&
            norms.getNormRef(norm.mappingCP).type<Norm::NO_NO_COMP_YES) {
        int32_t delta=norm.mappingCP-c;
        if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
            norm.type=Norm::NO_NO_DELTA;
            norm.offset=delta;
            return TRUE;
        }
    }
    return FALSE;
}

void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) {
    if(norm.cc!=0) {
        fprintf(stderr,
                "gennorm2 error: "
                "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
                (long)c);
        exit(U_INVALID_FORMAT_ERROR);
    }
    int32_t length;
    const CompositionPair *pairs=norm.getCompositionPairs(length);
    for(int32_t i=0; i<length; ++i) {
        const CompositionPair &pair=pairs[i];
        // 22 bits for the composite character and whether it combines forward.
        UChar32 compositeAndFwd=pair.composite<<1;
        if(norms.getNormRef(pair.composite).compositions!=NULL) {
            compositeAndFwd|=1;  // The composite character also combines-forward.
        }
        // Encode most pairs in two units and some in three.
        int32_t firstUnit, secondUnit, thirdUnit;
        if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
            if(compositeAndFwd<=0xffff) {
                firstUnit=pair.trail<<1;
                secondUnit=compositeAndFwd;
                thirdUnit=-1;
            } else {
                firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
                secondUnit=compositeAndFwd>>16;
                thirdUnit=compositeAndFwd;
            }
        } else {
            firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
                       (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
                      Normalizer2Impl::COMP_1_TRIPLE;
            secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
                       (compositeAndFwd>>16);
            thirdUnit=compositeAndFwd;
        }
        // Set the high bit of the first unit if this is the last composition pair.
        if(i==(length-1)) {
            firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
        }
        dataString.append((UChar)firstUnit).append((UChar)secondUnit);
        if(thirdUnit>=0) {
            dataString.append((UChar)thirdUnit);
        }
    }
}

void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
    if(start!=end) {
        fprintf(stderr,
                "gennorm2 error: unexpected shared data for "
                "multiple code points U+%04lX..U+%04lX\n",
                (long)start, (long)end);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
    if(norm.error!=nullptr) {
        fprintf(stderr, "gennorm2 error: U+%04lX %s\n", (long)start, norm.error);
        exit(U_INVALID_FORMAT_ERROR);
    }
    writeExtraData(start, norm);
}

//  Ticket #13342 - Disable optimizations on MSVC for this function as a workaround.
#if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
#pragma optimize( "", off )
#endif

void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
    switch(norm.type) {
    case Norm::INERT:
        break;  // no extra data
    case Norm::YES_YES_COMBINES_FWD:
        norm.offset=yesYesCompositions.length();
        writeCompositions(c, norm, yesYesCompositions);
        break;
    case Norm::YES_NO_COMBINES_FWD:
        norm.offset=yesNoMappingsAndCompositions.length()+
                writeMapping(c, norm, yesNoMappingsAndCompositions);
        writeCompositions(c, norm, yesNoMappingsAndCompositions);
        break;
    case Norm::YES_NO_MAPPING_ONLY:
        norm.offset=yesNoMappingsOnly.length()+
                writeMapping(c, norm, yesNoMappingsOnly);
        break;
    case Norm::NO_NO_COMP_YES:
        if(!optimizeFast && setNoNoDelta(c, norm)) {
            break;
        }
        norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes);
        break;
    case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
        if(!optimizeFast && setNoNoDelta(c, norm)) {
            break;
        }
        norm.offset=writeNoNoMapping(
            c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore);
        break;
    case Norm::NO_NO_COMP_NO_MAYBE_CC:
        norm.offset=writeNoNoMapping(
            c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC);
        break;
    case Norm::NO_NO_EMPTY:
        // There can be multiple extra data entries for mappings to the empty string
        // if they have different raw mappings.
        norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty);
        break;
    case Norm::MAYBE_YES_COMBINES_FWD:
        norm.offset=maybeYesCompositions.length();
        writeCompositions(c, norm, maybeYesCompositions);
        break;
    case Norm::MAYBE_YES_SIMPLE:
        break;  // no extra data
    case Norm::YES_YES_WITH_CC:
        break;  // no extra data
    default:  // Should not occur.
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
}

// Ticket #13342 - Turn optimization back on.
#if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210))
#pragma optimize( "", on )
#endif

U_NAMESPACE_END

#endif // #if !UCONFIG_NO_NORMALIZATION