collationrootelements.cpp   [plain text]


/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationrootelements.cpp
*
* created on: 2013mar05
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "collation.h"
#include "collationrootelements.h"
#include "uassert.h"

U_NAMESPACE_BEGIN

int64_t
CollationRootElements::lastCEWithPrimaryBefore(uint32_t p) const {
    if(p == 0) { return 0; }
    U_ASSERT(p > elements[elements[IX_FIRST_PRIMARY_INDEX]]);
    int32_t index = findP(p);
    uint32_t q = elements[index];
    uint32_t secTer;
    if(p == (q & 0xffffff00)) {
        // p == elements[index] is a root primary. Find the CE before it.
        // We must not be in a primary range.
        U_ASSERT((q & PRIMARY_STEP_MASK) == 0);
        secTer = elements[index - 1];
        if((secTer & SEC_TER_DELTA_FLAG) == 0) {
            // Primary CE just before p.
            p = secTer & 0xffffff00;
            secTer = Collation::COMMON_SEC_AND_TER_CE;
        } else {
            // secTer = last secondary & tertiary for the previous primary
            index -= 2;
            for(;;) {
                p = elements[index];
                if((p & SEC_TER_DELTA_FLAG) == 0) {
                    p &= 0xffffff00;
                    break;
                }
                --index;
            }
        }
    } else {
        // p > elements[index] which is the previous primary.
        // Find the last secondary & tertiary weights for it.
        p = q & 0xffffff00;
        secTer = Collation::COMMON_SEC_AND_TER_CE;
        for(;;) {
            q = elements[++index];
            if((q & SEC_TER_DELTA_FLAG) == 0) {
                // We must not be in a primary range.
                U_ASSERT((q & PRIMARY_STEP_MASK) == 0);
                break;
            }
            secTer = q;
        }
    }
    return ((int64_t)p << 32) | (secTer & ~SEC_TER_DELTA_FLAG);
}

int64_t
CollationRootElements::firstCEWithPrimaryAtLeast(uint32_t p) const {
    if(p == 0) { return 0; }
    int32_t index = findP(p);
    if(p != (elements[index] & 0xffffff00)) {
        for(;;) {
            p = elements[++index];
            if((p & SEC_TER_DELTA_FLAG) == 0) {
                // First primary after p. We must not be in a primary range.
                U_ASSERT((p & PRIMARY_STEP_MASK) == 0);
                break;
            }
        }
    }
    // The code above guarantees that p has at most 3 bytes: (p & 0xff) == 0.
    return ((int64_t)p << 32) | Collation::COMMON_SEC_AND_TER_CE;
}

uint32_t
CollationRootElements::getPrimaryBefore(uint32_t p, UBool isCompressible) const {
    int32_t index = findPrimary(p);
    int32_t step;
    uint32_t q = elements[index];
    if(p == (q & 0xffffff00)) {
        // Found p itself. Return the previous primary.
        // See if p is at the end of a previous range.
        step = (int32_t)q & PRIMARY_STEP_MASK;
        if(step == 0) {
            // p is not at the end of a range. Look for the previous primary.
            do {
                p = elements[--index];
            } while((p & SEC_TER_DELTA_FLAG) != 0);
            return p & 0xffffff00;
        }
    } else {
        // p is in a range, and not at the start.
        uint32_t nextElement = elements[index + 1];
        U_ASSERT(isEndOfPrimaryRange(nextElement));
        step = (int32_t)nextElement & PRIMARY_STEP_MASK;
    }
    // Return the previous range primary.
    if((p & 0xffff) == 0) {
        return Collation::decTwoBytePrimaryByOneStep(p, isCompressible, step);
    } else {
        return Collation::decThreeBytePrimaryByOneStep(p, isCompressible, step);
    }
}

uint32_t
CollationRootElements::getSecondaryBefore(uint32_t p, uint32_t s) const {
    int32_t index;
    uint32_t previousSec, sec;
    if(p == 0) {
        index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
        // Gap at the beginning of the secondary CE range.
        previousSec = 0;
        sec = elements[index] >> 16;
    } else {
        index = findPrimary(p) + 1;
        previousSec = Collation::BEFORE_WEIGHT16;
        sec = getFirstSecTerForPrimary(index) >> 16;
    }
    U_ASSERT(s >= sec);
    while(s > sec) {
        previousSec = sec;
        U_ASSERT((elements[index] & SEC_TER_DELTA_FLAG) != 0);
        sec = elements[index++] >> 16;
    }
    U_ASSERT(sec == s);
    return previousSec;
}

uint32_t
CollationRootElements::getTertiaryBefore(uint32_t p, uint32_t s, uint32_t t) const {
    U_ASSERT((t & ~Collation::ONLY_TERTIARY_MASK) == 0);
    int32_t index;
    uint32_t previousTer, secTer;
    if(p == 0) {
        if(s == 0) {
            index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX];
            // Gap at the beginning of the tertiary CE range.
            previousTer = 0;
        } else {
            index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
            previousTer = Collation::BEFORE_WEIGHT16;
        }
        secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
    } else {
        index = findPrimary(p) + 1;
        previousTer = Collation::BEFORE_WEIGHT16;
        secTer = getFirstSecTerForPrimary(index);
    }
    uint32_t st = (s << 16) | t;
    while(st > secTer) {
        if((secTer >> 16) == s) { previousTer = secTer; }
        U_ASSERT((elements[index] & SEC_TER_DELTA_FLAG) != 0);
        secTer = elements[index++] & ~SEC_TER_DELTA_FLAG;
    }
    U_ASSERT(secTer == st);
    return previousTer & 0xffff;
}

uint32_t
CollationRootElements::getPrimaryAfter(uint32_t p, int32_t index, UBool isCompressible) const {
    U_ASSERT(p == (elements[index] & 0xffffff00) || isEndOfPrimaryRange(elements[index + 1]));
    uint32_t q = elements[++index];
    int32_t step;
    if((q & SEC_TER_DELTA_FLAG) == 0 && (step = (int32_t)q & PRIMARY_STEP_MASK) != 0) {
        // Return the next primary in this range.
        if((p & 0xffff) == 0) {
            return Collation::incTwoBytePrimaryByOffset(p, isCompressible, step);
        } else {
            return Collation::incThreeBytePrimaryByOffset(p, isCompressible, step);
        }
    } else {
        // Return the next primary in the list.
        while((q & SEC_TER_DELTA_FLAG) != 0) {
            q = elements[++index];
        }
        U_ASSERT((q & PRIMARY_STEP_MASK) == 0);
        return q;
    }
}

uint32_t
CollationRootElements::getSecondaryAfter(int32_t index, uint32_t s) const {
    uint32_t secTer;
    uint32_t secLimit;
    if(index == 0) {
        // primary = 0
        U_ASSERT(s != 0);
        index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
        secTer = elements[index];
        // Gap at the end of the secondary CE range.
        secLimit = 0x10000;
    } else {
        U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]);
        secTer = getFirstSecTerForPrimary(index + 1);
        // If this is an explicit sec/ter unit, then it will be read once more.
        // Gap for secondaries of primary CEs.
        secLimit = getSecondaryBoundary();
    }
    for(;;) {
        uint32_t sec = secTer >> 16;
        if(sec > s) { return sec; }
        secTer = elements[++index];
        if((secTer & SEC_TER_DELTA_FLAG) == 0) { return secLimit; }
    }
}

uint32_t
CollationRootElements::getTertiaryAfter(int32_t index, uint32_t s, uint32_t t) const {
    uint32_t secTer;
    uint32_t terLimit;
    if(index == 0) {
        // primary = 0
        if(s == 0) {
            U_ASSERT(t != 0);
            index = (int32_t)elements[IX_FIRST_TERTIARY_INDEX];
            // Gap at the end of the tertiary CE range.
            terLimit = 0x4000;
        } else {
            index = (int32_t)elements[IX_FIRST_SECONDARY_INDEX];
            // Gap for tertiaries of primary/secondary CEs.
            terLimit = getTertiaryBoundary();
        }
        secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
    } else {
        U_ASSERT(index >= (int32_t)elements[IX_FIRST_PRIMARY_INDEX]);
        secTer = getFirstSecTerForPrimary(index + 1);
        // If this is an explicit sec/ter unit, then it will be read once more.
        terLimit = getTertiaryBoundary();
    }
    uint32_t st = (s << 16) | t;
    for(;;) {
        if(secTer > st) {
            U_ASSERT((secTer >> 16) == s);
            return secTer & 0xffff;
        }
        secTer = elements[++index];
        // No tertiary greater than t for this primary+secondary.
        if((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) { return terLimit; }
        secTer &= ~SEC_TER_DELTA_FLAG;
    }
}

uint32_t
CollationRootElements::getFirstSecTerForPrimary(int32_t index) const {
    uint32_t secTer = elements[index];
    if((secTer & SEC_TER_DELTA_FLAG) == 0) {
        // No sec/ter delta.
        return Collation::COMMON_SEC_AND_TER_CE;
    }
    secTer &= ~SEC_TER_DELTA_FLAG;
    if(secTer > Collation::COMMON_SEC_AND_TER_CE) {
        // Implied sec/ter.
        return Collation::COMMON_SEC_AND_TER_CE;
    }
    // Explicit sec/ter below common/common.
    return secTer;
}

int32_t
CollationRootElements::findPrimary(uint32_t p) const {
    // Requirement: p must occur as a root primary.
    U_ASSERT((p & 0xff) == 0);  // at most a 3-byte primary
    int32_t index = findP(p);
    // If p is in a range, then we just assume that p is an actual primary in this range.
    // (Too cumbersome/expensive to check.)
    // Otherwise, it must be an exact match.
    U_ASSERT(isEndOfPrimaryRange(elements[index + 1]) || p == (elements[index] & 0xffffff00));
    return index;
}

int32_t
CollationRootElements::findP(uint32_t p) const {
    // p need not occur as a root primary.
    // For example, it might be a reordering group boundary.
    U_ASSERT((p >> 24) != Collation::UNASSIGNED_IMPLICIT_BYTE);
    // modified binary search
    int32_t start = (int32_t)elements[IX_FIRST_PRIMARY_INDEX];
    U_ASSERT(p >= elements[start]);
    int32_t limit = length - 1;
    U_ASSERT(elements[limit] >= PRIMARY_SENTINEL);
    U_ASSERT(p < elements[limit]);
    while((start + 1) < limit) {
        // Invariant: elements[start] and elements[limit] are primaries,
        // and elements[start]<=p<=elements[limit].
        int32_t i = (start + limit) / 2;
        uint32_t q = elements[i];
        if((q & SEC_TER_DELTA_FLAG) != 0) {
            // Find the next primary.
            int32_t j = i + 1;
            for(;;) {
                if(j == limit) { break; }
                q = elements[j];
                if((q & SEC_TER_DELTA_FLAG) == 0) {
                    i = j;
                    break;
                }
                ++j;
            }
            if((q & SEC_TER_DELTA_FLAG) != 0) {
                // Find the preceding primary.
                j = i - 1;
                for(;;) {
                    if(j == start) { break; }
                    q = elements[j];
                    if((q & SEC_TER_DELTA_FLAG) == 0) {
                        i = j;
                        break;
                    }
                    --j;
                }
                if((q & SEC_TER_DELTA_FLAG) != 0) {
                    // No primary between start and limit.
                    break;
                }
            }
        }
        if(p < (q & 0xffffff00)) {  // Reset the "step" bits of a range end primary.
            limit = i;
        } else {
            start = i;
        }
    }
    return start;
}

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION