numparse_affixes.cpp   [plain text]


// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT

#include "numparse_types.h"
#include "numparse_affixes.h"
#include "numparse_utils.h"
#include "number_utils.h"

using namespace icu;
using namespace icu::numparse;
using namespace icu::numparse::impl;
using namespace icu::number;
using namespace icu::number::impl;


namespace {

/**
 * Helper method to return whether the given AffixPatternMatcher equals the given pattern string.
 * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal
 * the given pattern string.
 */
static bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) {
    return (affix == nullptr && patternString.isBogus()) ||
           (affix != nullptr && affix->getPattern() == patternString);
}

/**
 * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null.
 */
static int32_t length(const AffixPatternMatcher* matcher) {
    return matcher == nullptr ? 0 : matcher->getPattern().length();
}

/**
 * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both
 * valid, whether they are equal according to operator==.  Similar to Java Objects.equals()
 */
static bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) {
    if (lhs == nullptr && rhs == nullptr) {
        return true;
    }
    if (lhs == nullptr || rhs == nullptr) {
        return false;
    }
    return *lhs == *rhs;
}

}


AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern,
                                                       AffixTokenMatcherWarehouse& warehouse,
                                                       IgnorablesMatcher* ignorables)
        : fMatchersLen(0),
          fLastTypeOrCp(0),
          fPattern(pattern),
          fWarehouse(warehouse),
          fIgnorables(ignorables) {}

void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) {
    // This is called by AffixUtils.iterateWithConsumer() for each token.

    // Add an ignorables matcher between tokens except between two literals, and don't put two
    // ignorables matchers in a row.
    if (fIgnorables != nullptr && fMatchersLen > 0 &&
        (fLastTypeOrCp < 0 || !fIgnorables->getSet()->contains(fLastTypeOrCp))) {
        addMatcher(*fIgnorables);
    }

    if (type != TYPE_CODEPOINT) {
        // Case 1: the token is a symbol.
        switch (type) {
            case TYPE_MINUS_SIGN:
                addMatcher(fWarehouse.minusSign());
                break;
            case TYPE_PLUS_SIGN:
                addMatcher(fWarehouse.plusSign());
                break;
            case TYPE_PERCENT:
                addMatcher(fWarehouse.percent());
                break;
            case TYPE_PERMILLE:
                addMatcher(fWarehouse.permille());
                break;
            case TYPE_CURRENCY_SINGLE:
            case TYPE_CURRENCY_DOUBLE:
            case TYPE_CURRENCY_TRIPLE:
            case TYPE_CURRENCY_QUAD:
            case TYPE_CURRENCY_QUINT:
                // All currency symbols use the same matcher
                addMatcher(fWarehouse.currency(status));
                break;
            default:
                U_ASSERT(FALSE);
        }

    } else if (fIgnorables != nullptr && fIgnorables->getSet()->contains(cp)) {
        // Case 2: the token is an ignorable literal.
        // No action necessary: the ignorables matcher has already been added.

    } else {
        // Case 3: the token is a non-ignorable literal.
        addMatcher(fWarehouse.nextCodePointMatcher(cp));
    }
    fLastTypeOrCp = type != TYPE_CODEPOINT ? type : cp;
}

void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) {
    if (fMatchersLen >= fMatchers.getCapacity()) {
        fMatchers.resize(fMatchersLen * 2, fMatchersLen);
    }
    fMatchers[fMatchersLen++] = &matcher;
}

AffixPatternMatcher AffixPatternMatcherBuilder::build() {
    return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern);
}


CodePointMatcherWarehouse::CodePointMatcherWarehouse()
        : codePointCount(0), codePointNumBatches(0) {}

CodePointMatcherWarehouse::~CodePointMatcherWarehouse() {
    // Delete the variable number of batches of code point matchers
    for (int32_t i = 0; i < codePointNumBatches; i++) {
        delete[] codePointsOverflow[i];
    }
}

CodePointMatcherWarehouse::CodePointMatcherWarehouse(CodePointMatcherWarehouse&& src) U_NOEXCEPT
        : codePoints(std::move(src.codePoints)),
          codePointsOverflow(std::move(src.codePointsOverflow)),
          codePointCount(src.codePointCount),
          codePointNumBatches(src.codePointNumBatches) {}

CodePointMatcherWarehouse&
CodePointMatcherWarehouse::operator=(CodePointMatcherWarehouse&& src) U_NOEXCEPT {
    codePoints = std::move(src.codePoints);
    codePointsOverflow = std::move(src.codePointsOverflow);
    codePointCount = src.codePointCount;
    codePointNumBatches = src.codePointNumBatches;
    return *this;
}

NumberParseMatcher& CodePointMatcherWarehouse::nextCodePointMatcher(UChar32 cp) {
    if (codePointCount < CODE_POINT_STACK_CAPACITY) {
        return codePoints[codePointCount++] = {cp};
    }
    int32_t totalCapacity = CODE_POINT_STACK_CAPACITY + codePointNumBatches * CODE_POINT_BATCH_SIZE;
    if (codePointCount >= totalCapacity) {
        // Need a new batch
        auto* nextBatch = new CodePointMatcher[CODE_POINT_BATCH_SIZE];
        if (codePointNumBatches >= codePointsOverflow.getCapacity()) {
            // Need more room for storing pointers to batches
            codePointsOverflow.resize(codePointNumBatches * 2, codePointNumBatches);
        }
        codePointsOverflow[codePointNumBatches++] = nextBatch;
    }
    return codePointsOverflow[codePointNumBatches - 1][(codePointCount++ - CODE_POINT_STACK_CAPACITY) %
                                                       CODE_POINT_BATCH_SIZE] = {cp};
}


AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData)
        : fSetupData(setupData) {}

NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() {
    return fMinusSign = {fSetupData->dfs, true};
}

NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() {
    return fPlusSign = {fSetupData->dfs, true};
}

NumberParseMatcher& AffixTokenMatcherWarehouse::percent() {
    return fPercent = {fSetupData->dfs};
}

NumberParseMatcher& AffixTokenMatcherWarehouse::permille() {
    return fPermille = {fSetupData->dfs};
}

NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) {
    return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status};
}

IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() {
    return fSetupData->ignorables;
}

NumberParseMatcher& AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp) {
    return fCodePoints.nextCodePointMatcher(cp);
}


CodePointMatcher::CodePointMatcher(UChar32 cp)
        : fCp(cp) {}

bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
    if (segment.startsWith(fCp)) {
        segment.adjustOffsetByCodePoint();
        result.setCharsConsumed(segment);
    }
    return false;
}

bool CodePointMatcher::smokeTest(const StringSegment& segment) const {
    return segment.startsWith(fCp);
}

UnicodeString CodePointMatcher::toString() const {
    return u"<CodePoint>";
}


AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern,
                                                          AffixTokenMatcherWarehouse& tokenWarehouse,
                                                          parse_flags_t parseFlags, bool* success,
                                                          UErrorCode& status) {
    if (affixPattern.isEmpty()) {
        *success = false;
        return {};
    }
    *success = true;

    IgnorablesMatcher* ignorables;
    if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) {
        ignorables = nullptr;
    } else {
        ignorables = &tokenWarehouse.ignorables();
    }

    AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables);
    AffixUtils::iterateWithConsumer(affixPattern, builder, status);
    return builder.build();
}

AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen,
                                         const UnicodeString& pattern)
        : ArraySeriesMatcher(matchers, matchersLen), fPattern(pattern) {}

UnicodeString AffixPatternMatcher::getPattern() const {
    return fPattern.toAliasedUnicodeString();
}

bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const {
    return fPattern == other.fPattern;
}


AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse)
        : fTokenWarehouse(tokenWarehouse) {
}

bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo,
                                          const IgnorablesMatcher& ignorables, parse_flags_t parseFlags,
                                          UErrorCode& status) {
    UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX);
    UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX);
    UnicodeString negPrefixString;
    UnicodeString negSuffixString;
    if (patternInfo.hasNegativeSubpattern()) {
        negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX);
        negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX);
    }

    if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) &&
        AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) &&
        AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) &&
        AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) &&
        AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status)
        // HACK: Plus and minus sign are a special case: we accept them trailing only if they are
        // trailing in the pattern string.
        && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) &&
        !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) &&
        !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) &&
        !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) {
        // The affixes contain only symbols and ignorables.
        // No need to generate affix matchers.
        return false;
    }
    return true;
}

void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo,
                                                MutableMatcherCollection& output,
                                                const IgnorablesMatcher& ignorables,
                                                parse_flags_t parseFlags, UErrorCode& status) {
    if (!isInteresting(patternInfo, ignorables, parseFlags, status)) {
        return;
    }

    // The affixes have interesting characters, or we are in strict mode.
    // Use initial capacity of 6, the highest possible number of AffixMatchers.
    UnicodeString sb;
    bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES);
    UNumberSignDisplay signDisplay = (0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) ? UNUM_SIGN_ALWAYS
                                                                                        : UNUM_SIGN_AUTO;

    int32_t numAffixMatchers = 0;
    int32_t numAffixPatternMatchers = 0;

    AffixPatternMatcher* posPrefix = nullptr;
    AffixPatternMatcher* posSuffix = nullptr;

    // Pre-process the affix strings to resolve LDML rules like sign display.
    for (int8_t signum = 1; signum >= -1; signum--) {
        // Generate Prefix
        bool hasPrefix = false;
        PatternStringUtils::patternInfoToStringBuilder(
                patternInfo, true, signum, signDisplay, StandardPlural::OTHER, false, sb);
        fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
                sb, *fTokenWarehouse, parseFlags, &hasPrefix, status);
        AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
                                                : nullptr;

        // Generate Suffix
        bool hasSuffix = false;
        PatternStringUtils::patternInfoToStringBuilder(
                patternInfo, false, signum, signDisplay, StandardPlural::OTHER, false, sb);
        fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
                sb, *fTokenWarehouse, parseFlags, &hasSuffix, status);
        AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
                                                : nullptr;

        if (signum == 1) {
            posPrefix = prefix;
            posSuffix = suffix;
        } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) {
            // Skip adding these matchers (we already have equivalents)
            continue;
        }

        // Flags for setting in the ParsedNumber; the token matchers may add more.
        int flags = (signum == -1) ? FLAG_NEGATIVE : 0;

        // Note: it is indeed possible for posPrefix and posSuffix to both be null.
        // We still need to add that matcher for strict mode to work.
        fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags};
        if (includeUnpaired && prefix != nullptr && suffix != nullptr) {
            // The following if statements are designed to prevent adding two identical matchers.
            if (signum == 1 || !equals(prefix, posPrefix)) {
                fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags};
            }
            if (signum == 1 || !equals(suffix, posSuffix)) {
                fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags};
            }
        }
    }

    // Put the AffixMatchers in order, and then add them to the output.
    // Since there are at most 9 elements, do a simple-to-implement bubble sort.
    bool madeChanges;
    do {
        madeChanges = false;
        for (int32_t i = 1; i < numAffixMatchers; i++) {
            if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) {
                madeChanges = true;
                AffixMatcher temp = std::move(fAffixMatchers[i - 1]);
                fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]);
                fAffixMatchers[i] = std::move(temp);
            }
        }
    } while (madeChanges);

    for (int32_t i = 0; i < numAffixMatchers; i++) {
        // Enable the following line to debug affixes
        //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl;
        output.addMatcher(fAffixMatchers[i]);
    }
}


AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags)
        : fPrefix(prefix), fSuffix(suffix), fFlags(flags) {}

bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
    if (!result.seenNumber()) {
        // Prefix
        // Do not match if:
        // 1. We have already seen a prefix (result.prefix != null)
        // 2. The prefix in this AffixMatcher is empty (prefix == null)
        if (!result.prefix.isBogus() || fPrefix == nullptr) {
            return false;
        }

        // Attempt to match the prefix.
        int initialOffset = segment.getOffset();
        bool maybeMore = fPrefix->match(segment, result, status);
        if (initialOffset != segment.getOffset()) {
            result.prefix = fPrefix->getPattern();
        }
        return maybeMore;

    } else {
        // Suffix
        // Do not match if:
        // 1. We have already seen a suffix (result.suffix != null)
        // 2. The suffix in this AffixMatcher is empty (suffix == null)
        // 3. The matched prefix does not equal this AffixMatcher's prefix
        if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) {
            return false;
        }

        // Attempt to match the suffix.
        int initialOffset = segment.getOffset();
        bool maybeMore = fSuffix->match(segment, result, status);
        if (initialOffset != segment.getOffset()) {
            result.suffix = fSuffix->getPattern();
        }
        return maybeMore;
    }
}

bool AffixMatcher::smokeTest(const StringSegment& segment) const {
    return (fPrefix != nullptr && fPrefix->smokeTest(segment)) ||
           (fSuffix != nullptr && fSuffix->smokeTest(segment));
}

void AffixMatcher::postProcess(ParsedNumber& result) const {
    // Check to see if our affix is the one that was matched. If so, set the flags in the result.
    if (matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix)) {
        // Fill in the result prefix and suffix with non-null values (empty string).
        // Used by strict mode to determine whether an entire affix pair was matched.
        if (result.prefix.isBogus()) {
            result.prefix = UnicodeString();
        }
        if (result.suffix.isBogus()) {
            result.suffix = UnicodeString();
        }
        result.flags |= fFlags;
        if (fPrefix != nullptr) {
            fPrefix->postProcess(result);
        }
        if (fSuffix != nullptr) {
            fSuffix->postProcess(result);
        }
    }
}

int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const {
    const AffixMatcher& lhs = *this;
    if (length(lhs.fPrefix) != length(rhs.fPrefix)) {
        return length(lhs.fPrefix) > length(rhs.fPrefix) ? -1 : 1;
    } else if (length(lhs.fSuffix) != length(rhs.fSuffix)) {
        return length(lhs.fSuffix) > length(rhs.fSuffix) ? -1 : 1;
    } else {
        return 0;
    }
}

UnicodeString AffixMatcher::toString() const {
    bool isNegative = 0 != (fFlags & FLAG_NEGATIVE);
    return UnicodeString(u"<Affix") + (isNegative ? u":negative " : u" ") +
           (fPrefix ? fPrefix->getPattern() : u"null") + u"#" +
           (fSuffix ? fSuffix->getPattern() : u"null") + u">";

}


#endif /* #if !UCONFIG_NO_FORMATTING */