utf16collationiterator.cpp   [plain text]


// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2010-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* utf16collationiterator.cpp
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "charstr.h"
#include "cmemory.h"
#include "collation.h"
#include "collationdata.h"
#include "collationfcd.h"
#include "collationiterator.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "utf16collationiterator.h"

U_NAMESPACE_BEGIN

UTF16CollationIterator::UTF16CollationIterator(const UTF16CollationIterator &other,
                                               const UChar *newText)
        : CollationIterator(other),
          start(newText),
          pos(newText + (other.pos - other.start)),
          limit(other.limit == NULL ? NULL : newText + (other.limit - other.start)) {
}

UTF16CollationIterator::~UTF16CollationIterator() {}

UBool
UTF16CollationIterator::operator==(const CollationIterator &other) const {
    if(!CollationIterator::operator==(other)) { return FALSE; }
    const UTF16CollationIterator &o = static_cast<const UTF16CollationIterator &>(other);
    // Compare the iterator state but not the text: Assume that the caller does that.
    return (pos - start) == (o.pos - o.start);
}

void
UTF16CollationIterator::resetToOffset(int32_t newOffset) {
    reset();
    pos = start + newOffset;
}

int32_t
UTF16CollationIterator::getOffset() const {
    return (int32_t)(pos - start);
}

uint32_t
UTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
    if(pos == limit) {
        c = U_SENTINEL;
        return Collation::FALLBACK_CE32;
    }
    c = *pos++;
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
}

UChar
UTF16CollationIterator::handleGetTrailSurrogate() {
    if(pos == limit) { return 0; }
    UChar trail;
    if(U16_IS_TRAIL(trail = *pos)) { ++pos; }
    return trail;
}

UBool
UTF16CollationIterator::foundNULTerminator() {
    if(limit == NULL) {
        limit = --pos;
        return TRUE;
    } else {
        return FALSE;
    }
}

UChar32
UTF16CollationIterator::nextCodePoint(UErrorCode & /*errorCode*/) {
    if(pos == limit) {
        return U_SENTINEL;
    }
    UChar32 c = *pos;
    if(c == 0 && limit == NULL) {
        limit = pos;
        return U_SENTINEL;
    }
    ++pos;
    UChar trail;
    if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
        ++pos;
        return U16_GET_SUPPLEMENTARY(c, trail);
    } else {
        return c;
    }
}

UChar32
UTF16CollationIterator::previousCodePoint(UErrorCode & /*errorCode*/) {
    if(pos == start) {
        return U_SENTINEL;
    }
    UChar32 c = *--pos;
    UChar lead;
    if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
        --pos;
        return U16_GET_SUPPLEMENTARY(lead, c);
    } else {
        return c;
    }
}

void
UTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    while(num > 0 && pos != limit) {
        UChar32 c = *pos;
        if(c == 0 && limit == NULL) {
            limit = pos;
            break;
        }
        ++pos;
        --num;
        if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(*pos)) {
            ++pos;
        }
    }
}

void
UTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode & /*errorCode*/) {
    while(num > 0 && pos != start) {
        UChar32 c = *--pos;
        --num;
        if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(*(pos-1))) {
            --pos;
        }
    }
}

// FCDUTF16CollationIterator ----------------------------------------------- ***

FCDUTF16CollationIterator::FCDUTF16CollationIterator(const FCDUTF16CollationIterator &other,
                                                     const UChar *newText)
        : UTF16CollationIterator(other),
          rawStart(newText),
          segmentStart(newText + (other.segmentStart - other.rawStart)),
          segmentLimit(other.segmentLimit == NULL ? NULL : newText + (other.segmentLimit - other.rawStart)),
          rawLimit(other.rawLimit == NULL ? NULL : newText + (other.rawLimit - other.rawStart)),
          nfcImpl(other.nfcImpl),
          normalized(other.normalized),
          checkDir(other.checkDir) {
    if(checkDir != 0 || other.start == other.segmentStart) {
        start = newText + (other.start - other.rawStart);
        pos = newText + (other.pos - other.rawStart);
        limit = other.limit == NULL ? NULL : newText + (other.limit - other.rawStart);
    } else {
        start = normalized.getBuffer();
        pos = start + (other.pos - other.start);
        limit = start + normalized.length();
    }
}

FCDUTF16CollationIterator::~FCDUTF16CollationIterator() {}

UBool
FCDUTF16CollationIterator::operator==(const CollationIterator &other) const {
    // Skip the UTF16CollationIterator and call its parent.
    if(!CollationIterator::operator==(other)) { return FALSE; }
    const FCDUTF16CollationIterator &o = static_cast<const FCDUTF16CollationIterator &>(other);
    // Compare the iterator state but not the text: Assume that the caller does that.
    if(checkDir != o.checkDir) { return FALSE; }
    if(checkDir == 0 && (start == segmentStart) != (o.start == o.segmentStart)) { return FALSE; }
    if(checkDir != 0 || start == segmentStart) {
        return (pos - rawStart) == (o.pos - o.rawStart);
    } else {
        return (segmentStart - rawStart) == (o.segmentStart - o.rawStart) &&
                (pos - start) == (o.pos - o.start);
    }
}

void
FCDUTF16CollationIterator::resetToOffset(int32_t newOffset) {
    reset();
    start = segmentStart = pos = rawStart + newOffset;
    limit = rawLimit;
    checkDir = 1;
}

int32_t
FCDUTF16CollationIterator::getOffset() const {
    if(checkDir != 0 || start == segmentStart) {
        return (int32_t)(pos - rawStart);
    } else if(pos == start) {
        return (int32_t)(segmentStart - rawStart);
    } else {
        return (int32_t)(segmentLimit - rawStart);
    }
}

uint32_t
FCDUTF16CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) {
    for(;;) {
        if(checkDir > 0) {
            if(pos == limit) {
                c = U_SENTINEL;
                return Collation::FALLBACK_CE32;
            }
            c = *pos++;
            if(CollationFCD::hasTccc(c)) {
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                        (pos != limit && CollationFCD::hasLccc(*pos))) {
                    --pos;
                    if(!nextSegment(errorCode)) {
                        c = U_SENTINEL;
                        return Collation::FALLBACK_CE32;
                    }
                    c = *pos++;
                }
            }
            break;
        } else if(checkDir == 0 && pos != limit) {
            c = *pos++;
            break;
        } else {
            switchToForward();
        }
    }
    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
}

UBool
FCDUTF16CollationIterator::foundNULTerminator() {
    if(limit == NULL) {
        limit = rawLimit = --pos;
        return TRUE;
    } else {
        return FALSE;
    }
}

UChar32
FCDUTF16CollationIterator::nextCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(checkDir > 0) {
            if(pos == limit) {
                return U_SENTINEL;
            }
            c = *pos++;
            if(CollationFCD::hasTccc(c)) {
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                        (pos != limit && CollationFCD::hasLccc(*pos))) {
                    --pos;
                    if(!nextSegment(errorCode)) {
                        return U_SENTINEL;
                    }
                    c = *pos++;
                }
            } else if(c == 0 && limit == NULL) {
                limit = rawLimit = --pos;
                return U_SENTINEL;
            }
            break;
        } else if(checkDir == 0 && pos != limit) {
            c = *pos++;
            break;
        } else {
            switchToForward();
        }
    }
    UChar trail;
    if(U16_IS_LEAD(c) && pos != limit && U16_IS_TRAIL(trail = *pos)) {
        ++pos;
        return U16_GET_SUPPLEMENTARY(c, trail);
    } else {
        return c;
    }
}

UChar32
FCDUTF16CollationIterator::previousCodePoint(UErrorCode &errorCode) {
    UChar32 c;
    for(;;) {
        if(checkDir < 0) {
            if(pos == start) {
                return U_SENTINEL;
            }
            c = *--pos;
            if(CollationFCD::hasLccc(c)) {
                if(CollationFCD::maybeTibetanCompositeVowel(c) ||
                        (pos != start && CollationFCD::hasTccc(*(pos - 1)))) {
                    ++pos;
                    if(!previousSegment(errorCode)) {
                        return U_SENTINEL;
                    }
                    c = *--pos;
                }
            }
            break;
        } else if(checkDir == 0 && pos != start) {
            c = *--pos;
            break;
        } else {
            switchToBackward();
        }
    }
    UChar lead;
    if(U16_IS_TRAIL(c) && pos != start && U16_IS_LEAD(lead = *(pos - 1))) {
        --pos;
        return U16_GET_SUPPLEMENTARY(lead, c);
    } else {
        return c;
    }
}

void
FCDUTF16CollationIterator::forwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    // Specify the class to avoid a virtual-function indirection.
    // In Java, we would declare this class final.
    while(num > 0 && FCDUTF16CollationIterator::nextCodePoint(errorCode) >= 0) {
        --num;
    }
}

void
FCDUTF16CollationIterator::backwardNumCodePoints(int32_t num, UErrorCode &errorCode) {
    // Specify the class to avoid a virtual-function indirection.
    // In Java, we would declare this class final.
    while(num > 0 && FCDUTF16CollationIterator::previousCodePoint(errorCode) >= 0) {
        --num;
    }
}

void
FCDUTF16CollationIterator::switchToForward() {
    U_ASSERT(checkDir < 0 || (checkDir == 0 && pos == limit));
    if(checkDir < 0) {
        // Turn around from backward checking.
        start = segmentStart = pos;
        if(pos == segmentLimit) {
            limit = rawLimit;
            checkDir = 1;  // Check forward.
        } else {  // pos < segmentLimit
            checkDir = 0;  // Stay in FCD segment.
        }
    } else {
        // Reached the end of the FCD segment.
        if(start == segmentStart) {
            // The input text segment is FCD, extend it forward.
        } else {
            // The input text segment needed to be normalized.
            // Switch to checking forward from it.
            pos = start = segmentStart = segmentLimit;
            // Note: If this segment is at the end of the input text,
            // then it might help to return FALSE to indicate that, so that
            // we do not have to re-check and normalize when we turn around and go backwards.
            // However, that would complicate the call sites for an optimization of an unusual case.
        }
        limit = rawLimit;
        checkDir = 1;
    }
}

UBool
FCDUTF16CollationIterator::nextSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return FALSE; }
    U_ASSERT(checkDir > 0 && pos != limit);
    // The input text [segmentStart..pos[ passes the FCD check.
    const UChar *p = pos;
    uint8_t prevCC = 0;
    for(;;) {
        // Fetch the next character's fcd16 value.
        const UChar *q = p;
        uint16_t fcd16 = nfcImpl.nextFCD16(p, rawLimit);
        uint8_t leadCC = (uint8_t)(fcd16 >> 8);
        if(leadCC == 0 && q != pos) {
            // FCD boundary before the [q, p[ character.
            limit = segmentLimit = q;
            break;
        }
        if(leadCC != 0 && (prevCC > leadCC || CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
            // Fails FCD check. Find the next FCD boundary and normalize.
            do {
                q = p;
            } while(p != rawLimit && nfcImpl.nextFCD16(p, rawLimit) > 0xff);
            if(!normalize(pos, q, errorCode)) { return FALSE; }
            pos = start;
            break;
        }
        prevCC = (uint8_t)fcd16;
        if(p == rawLimit || prevCC == 0) {
            // FCD boundary after the last character.
            limit = segmentLimit = p;
            break;
        }
    }
    U_ASSERT(pos != limit);
    checkDir = 0;
    return TRUE;
}

void
FCDUTF16CollationIterator::switchToBackward() {
    U_ASSERT(checkDir > 0 || (checkDir == 0 && pos == start));
    if(checkDir > 0) {
        // Turn around from forward checking.
        limit = segmentLimit = pos;
        if(pos == segmentStart) {
            start = rawStart;
            checkDir = -1;  // Check backward.
        } else {  // pos > segmentStart
            checkDir = 0;  // Stay in FCD segment.
        }
    } else {
        // Reached the start of the FCD segment.
        if(start == segmentStart) {
            // The input text segment is FCD, extend it backward.
        } else {
            // The input text segment needed to be normalized.
            // Switch to checking backward from it.
            pos = limit = segmentLimit = segmentStart;
        }
        start = rawStart;
        checkDir = -1;
    }
}

UBool
FCDUTF16CollationIterator::previousSegment(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return FALSE; }
    U_ASSERT(checkDir < 0 && pos != start);
    // The input text [pos..segmentLimit[ passes the FCD check.
    const UChar *p = pos;
    uint8_t nextCC = 0;
    for(;;) {
        // Fetch the previous character's fcd16 value.
        const UChar *q = p;
        uint16_t fcd16 = nfcImpl.previousFCD16(rawStart, p);
        uint8_t trailCC = (uint8_t)fcd16;
        if(trailCC == 0 && q != pos) {
            // FCD boundary after the [p, q[ character.
            start = segmentStart = q;
            break;
        }
        if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) ||
                            CollationFCD::isFCD16OfTibetanCompositeVowel(fcd16))) {
            // Fails FCD check. Find the previous FCD boundary and normalize.
            do {
                q = p;
            } while(fcd16 > 0xff && p != rawStart &&
                    (fcd16 = nfcImpl.previousFCD16(rawStart, p)) != 0);
            if(!normalize(q, pos, errorCode)) { return FALSE; }
            pos = limit;
            break;
        }
        nextCC = (uint8_t)(fcd16 >> 8);
        if(p == rawStart || nextCC == 0) {
            // FCD boundary before the following character.
            start = segmentStart = p;
            break;
        }
    }
    U_ASSERT(pos != start);
    checkDir = 0;
    return TRUE;
}

UBool
FCDUTF16CollationIterator::normalize(const UChar *from, const UChar *to, UErrorCode &errorCode) {
    // NFD without argument checking.
    U_ASSERT(U_SUCCESS(errorCode));
    nfcImpl.decompose(from, to, normalized, (int32_t)(to - from), errorCode);
    if(U_FAILURE(errorCode)) { return FALSE; }
    // Switch collation processing into the FCD buffer
    // with the result of normalizing [segmentStart, segmentLimit[.
    segmentStart = from;
    segmentLimit = to;
    start = normalized.getBuffer();
    limit = start + normalized.length();
    return TRUE;
}

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION