numbertest_parse.cpp   [plain text]


// ยฉ 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

#include "numbertest.h"
#include "numparse_impl.h"
#include "static_unicode_sets.h"
#include "unicode/dcfmtsym.h"
#include "unicode/testlog.h"

#include <cmath>
#include <numparse_affixes.h>

void NumberParserTest::runIndexedTest(int32_t index, UBool exec, const char*& name, char*) {
    if (exec) {
        logln("TestSuite NumberParserTest: ");
    }
    TESTCASE_AUTO_BEGIN;
        TESTCASE_AUTO(testBasic);
        TESTCASE_AUTO(testSeriesMatcher);
        TESTCASE_AUTO(testCombinedCurrencyMatcher);
        TESTCASE_AUTO(testAffixPatternMatcher);
        TESTCASE_AUTO(test20360_BidiOverflow);
        TESTCASE_AUTO(testInfiniteRecursion);
    TESTCASE_AUTO_END;
}

void NumberParserTest::testBasic() {
    IcuTestErrorCode status(*this, "testBasic");

    static const struct TestCase {
        int32_t flags;
        const char16_t* inputString;
        const char16_t* patternString;
        int32_t expectedCharsConsumed;
        double expectedResultDouble;
    } cases[] = {{3, u"51423", u"0", 5, 51423.},
                 {3, u"51423x", u"0", 5, 51423.},
                 {3, u" 51423", u"0", 6, 51423.},
                 {3, u"51423 ", u"0", 5, 51423.},
                 {3, u"๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ", u"0", 10, 51423.},
                 {3, u"๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏx", u"0", 10, 51423.},
                 {3, u" ๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ", u"0", 11, 51423.},
                 {3, u"๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ ", u"0", 10, 51423.},
                 {7, u"51,423", u"#,##,##0", 6, 51423.},
                 {7, u" 51,423", u"#,##,##0", 7, 51423.},
                 {7, u"51,423 ", u"#,##,##0", 6, 51423.},
                 {7, u"51,423,", u"#,##,##0", 6, 51423.},
                 {7, u"51,423,,", u"#,##,##0", 6, 51423.},
                 {7, u"51,423.5", u"#,##,##0", 8, 51423.5},
                 {7, u"51,423.5,", u"#,##,##0", 8, 51423.5},
                 {7, u"51,423.5,,", u"#,##,##0", 8, 51423.5},
                 {7, u"51,423.5.", u"#,##,##0", 8, 51423.5},
                 {7, u"51,423.5..", u"#,##,##0", 8, 51423.5},
                 {7, u"๐Ÿฑ๐Ÿญ,๐Ÿฐ๐Ÿฎ๐Ÿฏ", u"#,##,##0", 11, 51423.},
                 {7, u"๐Ÿณ,๐Ÿด๐Ÿต,๐Ÿฑ๐Ÿญ,๐Ÿฐ๐Ÿฎ๐Ÿฏ", u"#,##,##0", 19, 78951423.},
                 {7, u"๐Ÿณ๐Ÿด,๐Ÿต๐Ÿฑ๐Ÿญ.๐Ÿฐ๐Ÿฎ๐Ÿฏ", u"#,##,##0", 18, 78951.423},
                 {7, u"๐Ÿณ๐Ÿด,๐Ÿฌ๐Ÿฌ๐Ÿฌ", u"#,##,##0", 11, 78000.},
                 {7, u"๐Ÿณ๐Ÿด,๐Ÿฌ๐Ÿฌ๐Ÿฌ.๐Ÿฌ๐Ÿฌ๐Ÿฌ", u"#,##,##0", 18, 78000.},
                 {7, u"๐Ÿณ๐Ÿด,๐Ÿฌ๐Ÿฌ๐Ÿฌ.๐Ÿฌ๐Ÿฎ๐Ÿฏ", u"#,##,##0", 18, 78000.023},
                 {7, u"๐Ÿณ๐Ÿด.๐Ÿฌ๐Ÿฌ๐Ÿฌ.๐Ÿฌ๐Ÿฎ๐Ÿฏ", u"#,##,##0", 11, 78.},
                 {7, u"1,", u"#,##,##0", 1, 1.},
                 {7, u"1,,", u"#,##,##0", 1, 1.},
                 {7, u"1.,", u"#,##,##0", 2, 1.},
                 {3, u"1,.", u"#,##,##0", 3, 1.},
                 {7, u"1..", u"#,##,##0", 2, 1.},
                 {3, u",1", u"#,##,##0", 2, 1.},
                 {3, u"1,1", u"#,##,##0", 1, 1.},
                 {3, u"1,1,", u"#,##,##0", 1, 1.},
                 {3, u"1,1,,", u"#,##,##0", 1, 1.},
                 {3, u"1,1,1", u"#,##,##0", 1, 1.},
                 {3, u"1,1,1,", u"#,##,##0", 1, 1.},
                 {3, u"1,1,1,,", u"#,##,##0", 1, 1.},
                 {3, u"-51423", u"0", 6, -51423.},
                 {3, u"51423-", u"0", 5, 51423.}, // plus and minus sign by default do NOT match after
                 {3, u"+51423", u"0", 6, 51423.},
                 {3, u"51423+", u"0", 5, 51423.}, // plus and minus sign by default do NOT match after
                 {3, u"%51423", u"0", 6, 51423.},
                 {3, u"51423%", u"0", 6, 51423.},
                 {3, u"51423%%", u"0", 6, 51423.},
                 {3, u"โ€ฐ51423", u"0", 6, 51423.},
                 {3, u"51423โ€ฐ", u"0", 6, 51423.},
                 {3, u"51423โ€ฐโ€ฐ", u"0", 6, 51423.},
                 {3, u"โˆž", u"0", 1, INFINITY},
                 {3, u"-โˆž", u"0", 2, -INFINITY},
                 {3, u"@@@123  @@", u"0", 6, 123.}, // TODO: Should padding be strong instead of weak?
                 {3, u"@@@123@@  ", u"0", 6, 123.}, // TODO: Should padding be strong instead of weak?
                 {3, u"a51423US dollars", u"a0ยคยคยค", 16, 51423.},
                 {3, u"a 51423 US dollars", u"a0ยคยคยค", 18, 51423.},
                 {3, u"514.23 USD", u"ยค0", 10, 514.23},
                 {3, u"514.23 GBP", u"ยค0", 10, 514.23},
                 {3, u"a ๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ b", u"a0b", 14, 51423.},
                 {3, u"-a ๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ b", u"a0b", 15, -51423.},
                 {3, u"a -๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ b", u"a0b", 15, -51423.},
                 {3, u"๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ", u"[0];(0)", 10, 51423.},
                 {3, u"[๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ", u"[0];(0)", 11, 51423.},
                 {3, u"๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ]", u"[0];(0)", 11, 51423.},
                 {3, u"[๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ]", u"[0];(0)", 12, 51423.},
                 {3, u"(๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ", u"[0];(0)", 11, -51423.},
                 {3, u"๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ)", u"[0];(0)", 11, -51423.},
                 {3, u"(๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ)", u"[0];(0)", 12, -51423.},
                 {3, u"๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ", u"{0};{0}", 10, 51423.},
                 {3, u"{๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ", u"{0};{0}", 11, 51423.},
                 {3, u"๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ}", u"{0};{0}", 11, 51423.},
                 {3, u"{๐Ÿฑ๐Ÿญ๐Ÿฐ๐Ÿฎ๐Ÿฏ}", u"{0};{0}", 12, 51423.},
                 {1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number
                 //{2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b" // with <rdar://problem/51938595>, slow has same result as greedy
                 {3, u"๐Ÿฑ.๐Ÿญ๐Ÿฐ๐ŸฎE๐Ÿฏ", u"0", 12, 5142.},
                 {3, u"๐Ÿฑ.๐Ÿญ๐Ÿฐ๐ŸฎE-๐Ÿฏ", u"0", 13, 0.005142},
                 {3, u"๐Ÿฑ.๐Ÿญ๐Ÿฐ๐Ÿฎe-๐Ÿฏ", u"0", 13, 0.005142},
                 {3, u"5.142e+3", u"0", 8, 5142.0 },
                 {3, u"5.142\u200Ee+3", u"0", 9, 5142.0},
                 {3, u"5.142e\u200E+3", u"0", 9, 5142.0},
                 {3, u"5.142e+\u200E3", u"0", 9, 5142.0},
                 {7, u"5,142.50 Canadian dollars", u"#,##,##0 ยคยคยค", 25, 5142.5},
                 {3, u"a$ b5", u"a ยค b0", 5, 5.0},
                 {3, u"๐Ÿ“บ1.23", u"๐Ÿ“บ0;๐Ÿ“ป0", 6, 1.23},
                 {3, u"๐Ÿ“ป1.23", u"๐Ÿ“บ0;๐Ÿ“ป0", 6, -1.23},
                 {3, u".00", u"0", 3, 0.0},
                 {3, u"                              1,234", u"a0", 35, 1234.}, // should not hang
                 {3, u"NaN", u"0", 3, NAN},
                 {3, u"NaN E5", u"0", 6, NAN},
                 {3, u"0", u"0", 1, 0.0}};

    parse_flags_t parseFlags = PARSE_FLAG_IGNORE_CASE | PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
    for (auto& cas : cases) {
        UnicodeString inputString(cas.inputString);
        UnicodeString patternString(cas.patternString);
        LocalPointer<const NumberParserImpl> parser(
                NumberParserImpl::createSimpleParser(
                        Locale("en"), patternString, parseFlags, status));
        if (status.errDataIfFailureAndReset("createSimpleParser() failed")) {
            continue;
        }
        UnicodeString message =
                UnicodeString("Input <") + inputString + UnicodeString("> Parser ") + parser->toString();

        if (0 != (cas.flags & 0x01)) {
            // Test greedy code path
            ParsedNumber resultObject;
            parser->parse(inputString, true, resultObject, status);
            assertTrue("Greedy Parse failed: " + message, resultObject.success());
            assertEquals("Greedy Parse failed: " + message,
                cas.expectedCharsConsumed, resultObject.charEnd);
            assertEquals("Greedy Parse failed: " + message,
                cas.expectedResultDouble, resultObject.getDouble(status));
        }

        if (0 != (cas.flags & 0x02)) {
            // Test slow code path
            ParsedNumber resultObject;
            parser->parse(inputString, false, resultObject, status);
            assertTrue("Non-Greedy Parse failed: " + message, resultObject.success());
            assertEquals(
                    "Non-Greedy Parse failed: " + message,
                    cas.expectedCharsConsumed,
                    resultObject.charEnd);
            assertEquals(
                    "Non-Greedy Parse failed: " + message,
                    cas.expectedResultDouble,
                    resultObject.getDouble(status));
        }

        if (0 != (cas.flags & 0x04)) {
            // Test with strict separators
            parser.adoptInstead(
                    NumberParserImpl::createSimpleParser(
                            Locale("en"),
                            patternString,
                            parseFlags | PARSE_FLAG_STRICT_GROUPING_SIZE,
                            status));
            ParsedNumber resultObject;
            parser->parse(inputString, true, resultObject, status);
            assertTrue("Strict Parse failed: " + message, resultObject.success());
            assertEquals("Strict Parse failed: " + message,
                cas.expectedCharsConsumed, resultObject.charEnd);
            assertEquals("Strict Parse failed: " + message,
                cas.expectedResultDouble, resultObject.getDouble(status));
        }
    }
}

void NumberParserTest::testSeriesMatcher() {
    IcuTestErrorCode status(*this, "testSeriesMatcher");

    DecimalFormatSymbols symbols("en", status);
    if (status.errDataIfFailureAndReset("Failure in DecimalFormtSymbols constructor")) {
        return;
    }
    PlusSignMatcher m0(symbols, false);
    MinusSignMatcher m1(symbols, false);
    IgnorablesMatcher m2(0);
    PercentMatcher m3(symbols);
    IgnorablesMatcher m4(0);

    ArraySeriesMatcher::MatcherArray matchers(5);
    matchers[0] = &m0;
    matchers[1] = &m1;
    matchers[2] = &m2;
    matchers[3] = &m3;
    matchers[4] = &m4;
    ArraySeriesMatcher series(matchers, 5);

    assertFalse("", series.smokeTest(StringSegment(u"x", false)));
    assertFalse("", series.smokeTest(StringSegment(u"-", false)));
    assertTrue("", series.smokeTest(StringSegment(u"+", false)));

    static const struct TestCase {
        const char16_t* input;
        int32_t expectedOffset;
        bool expectedMaybeMore;
    } cases[] = {{u"", 0, true},
                 {u" ", 0, false},
                 {u"$", 0, false},
                 {u"+", 0, true},
                 {u" +", 0, false},
                 {u"+-", 0, true},
                 {u"+ -", 0, false},
                 {u"+-  ", 0, true},
                 {u"+-  $", 0, false},
                 {u"+-%", 3, true},
                 {u"  +-  %  ", 0, false},
                 {u"+-  %  ", 7, true},
                 {u"+-%$", 3, false}};

    for (auto& cas : cases) {
        UnicodeString input(cas.input);

        StringSegment segment(input, false);
        ParsedNumber result;
        bool actualMaybeMore = series.match(segment, result, status);
        int actualOffset = segment.getOffset();

        assertEquals("'" + input + "'", cas.expectedOffset, actualOffset);
        assertEquals("'" + input + "'", cas.expectedMaybeMore, actualMaybeMore);
    }
}

void NumberParserTest::testCombinedCurrencyMatcher() {
    IcuTestErrorCode status(*this, "testCombinedCurrencyMatcher");

    IgnorablesMatcher ignorables(0);
    Locale locale = Locale::getEnglish();

    DecimalFormatSymbols dfs(locale, status);
    if (status.errDataIfFailureAndReset("Failure in DecimalFormtSymbols constructor")) {
        return;
    }
    dfs.setSymbol(DecimalFormatSymbols::kCurrencySymbol, u"IU$", status);
    dfs.setSymbol(DecimalFormatSymbols::kIntlCurrencySymbol, u"ICU", status);
    CurrencySymbols currencySymbols({u"ICU", status}, locale, dfs, status);

    AffixTokenMatcherSetupData affixSetupData = {
            currencySymbols, {"en", status}, ignorables, "en", 0};
    AffixTokenMatcherWarehouse warehouse(&affixSetupData);
    NumberParseMatcher& matcher = warehouse.currency(status);
    affixSetupData.parseFlags = PARSE_FLAG_NO_FOREIGN_CURRENCY;
    AffixTokenMatcherWarehouse warehouseNoForeign(&affixSetupData);
    NumberParseMatcher& matcherNoForeign = warehouseNoForeign.currency(status);

    static const struct TestCase {
        const char16_t* input;
        const char16_t* expectedCurrencyCode;
        const char16_t* expectedNoForeignCurrencyCode;
    } cases[]{{u"", u"", u""},
              {u"FOO", u"", u""},
              {u"USD", u"USD", u""},
              {u"$", u"USD", u""},
              {u"US dollars", u"USD", u""},
              {u"eu", u"", u""},
              {u"euros", u"EUR", u""},
              {u"ICU", u"ICU", u"ICU"},
              {u"IU$", u"ICU", u"ICU"}};
    for (auto& cas : cases) {
        UnicodeString input(cas.input);

        {
            StringSegment segment(input, false);
            ParsedNumber result;
            matcher.match(segment, result, status);
            assertEquals(
                    "Parsing " + input,
                    cas.expectedCurrencyCode,
                    result.currencyCode);
            assertEquals(
                    "Whole string on " + input,
                    cas.expectedCurrencyCode[0] == 0 ? 0 : input.length(),
                    result.charEnd);
        }
        {
            StringSegment segment(input, false);
            ParsedNumber result;
            matcherNoForeign.match(segment, result, status);
            assertEquals(
                    "[no foreign] Parsing " + input,
                    cas.expectedNoForeignCurrencyCode,
                    result.currencyCode);
            assertEquals(
                    "[no foreign] Whole string on " + input,
                    cas.expectedNoForeignCurrencyCode[0] == 0 ? 0 : input.length(),
                    result.charEnd);
        }
    }
}

void NumberParserTest::testAffixPatternMatcher() {
    IcuTestErrorCode status(*this, "testAffixPatternMatcher");
    Locale locale = Locale::getEnglish();
    IgnorablesMatcher ignorables(0);

    DecimalFormatSymbols dfs(locale, status);
    dfs.setSymbol(DecimalFormatSymbols::kCurrencySymbol, u"IU$", status);
    dfs.setSymbol(DecimalFormatSymbols::kIntlCurrencySymbol, u"ICU", status);
    CurrencySymbols currencySymbols({u"ICU", status}, locale, dfs, status);

    AffixTokenMatcherSetupData affixSetupData = {
            currencySymbols, {"en", status}, ignorables, "en", 0};
    AffixTokenMatcherWarehouse warehouse(&affixSetupData);

    static const struct TestCase {
        bool exactMatch;
        const char16_t* affixPattern;
        int32_t expectedMatcherLength;
        const char16_t* sampleParseableString;
    } cases[] = {{false, u"-", 1, u"-"},
                 {false, u"+-%", 5, u"+-%"},
                 {true, u"+-%", 3, u"+-%"},
                 {false, u"ab c", 5, u"a    bc"},
                 {true, u"abc", 3, u"abc"},
                 {false, u"hello-to+this%veryยคlongโ€ฐstring", 59, u"hello-to+this%very USD longโ€ฐstring"}};

    for (auto& cas : cases) {
        UnicodeString affixPattern(cas.affixPattern);
        UnicodeString sampleParseableString(cas.sampleParseableString);
        int parseFlags = cas.exactMatch ? PARSE_FLAG_EXACT_AFFIX : 0;

        bool success;
        AffixPatternMatcher matcher = AffixPatternMatcher::fromAffixPattern(
                affixPattern, warehouse, parseFlags, &success, status);
        if (!status.errDataIfFailureAndReset("Creation should be successful")) {

            // Check that the matcher has the expected number of children
            assertEquals(affixPattern + " " + cas.exactMatch, cas.expectedMatcherLength, matcher.length());

            // Check that the matcher works on a sample string
            StringSegment segment(sampleParseableString, false);
            ParsedNumber result;
            matcher.match(segment, result, status);
            assertEquals(affixPattern + " " + cas.exactMatch, sampleParseableString.length(), result.charEnd);
        }
    }
}

void NumberParserTest::test20360_BidiOverflow() {
    IcuTestErrorCode status(*this, "test20360_BidiOverflow");
    UnicodeString inputString;
    inputString.append(u'-');
    for (int32_t i=0; i<100000; i++) {
        inputString.append(u'\u061C');
    }
    inputString.append(u'5');

    LocalPointer<const NumberParserImpl> parser(NumberParserImpl::createSimpleParser("en", u"0", 0, status));
    if (status.errDataIfFailureAndReset("createSimpleParser() failed")) {
        return;
    }

    ParsedNumber resultObject;
    parser->parse(inputString, true, resultObject, status);
    assertTrue("Greedy Parse, success", resultObject.success());
    assertEquals("Greedy Parse, chars consumed", 100002, resultObject.charEnd);
    assertEquals("Greedy Parse, expected double", -5.0, resultObject.getDouble(status));

    resultObject.clear();
    parser->parse(inputString, false, resultObject, status);
    assertFalse("Non-Greedy Parse, success", resultObject.success());
    assertEquals("Non-Greedy Parse, chars consumed", 1, resultObject.charEnd);
}

void NumberParserTest::testInfiniteRecursion() {
    IcuTestErrorCode status(*this, "testInfiniteRecursion");
    UnicodeString inputString;
    inputString.append(u'-');
    for (int32_t i=0; i<200; i++) {
        inputString.append(u'\u061C');
    }
    inputString.append(u'5');

    LocalPointer<const NumberParserImpl> parser(NumberParserImpl::createSimpleParser("en", u"0", 0, status));
    if (status.errDataIfFailureAndReset("createSimpleParser() failed")) {
        return;
    }

    ParsedNumber resultObject;
    parser->parse(inputString, false, resultObject, status);
    assertFalse("Default recursion limit, success", resultObject.success());
    assertEquals("Default recursion limit, chars consumed", 1, resultObject.charEnd);

    parser.adoptInstead(NumberParserImpl::createSimpleParser(
        "en", u"0", PARSE_FLAG_ALLOW_INFINITE_RECURSION, status));
    resultObject.clear();
    parser->parse(inputString, false, resultObject, status);
    assertTrue("Unlimited recursion, success", resultObject.success());
    assertEquals("Unlimited recursion, chars consumed", 202, resultObject.charEnd);
    assertEquals("Unlimited recursion, expected double", -5.0, resultObject.getDouble(status));
}


#endif