ucoleitr.cpp   [plain text]


// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*   Copyright (C) 2001-2016, International Business Machines
*   Corporation and others.  All Rights Reserved.
******************************************************************************
*
* File ucoleitr.cpp
*
* Modification History:
*
* Date        Name        Description
* 02/15/2001  synwee      Modified all methods to process its own function 
*                         instead of calling the equivalent c++ api (coleitr.h)
* 2012-2014   markus      Rewritten in C++ again.
******************************************************************************/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/coleitr.h"
#include "unicode/tblcoll.h"
#include "unicode/ucoleitr.h"
#include "unicode/ustring.h"
#include "unicode/sortkey.h"
#include "unicode/uobject.h"
#include "cmemory.h"
#include "usrchimp.h"

U_NAMESPACE_USE

#define BUFFER_LENGTH             100

#define DEFAULT_BUFFER_SIZE 16
#define BUFFER_GROW 8

#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])

#define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))

#define DELETE_ARRAY(array) uprv_free((void *) (array))

struct RCEI
{
    uint32_t ce;
    int32_t  low;
    int32_t  high;
};

U_NAMESPACE_BEGIN

struct RCEBuffer
{
    RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
    RCEI   *buffer;
    int32_t bufferIndex;
    int32_t bufferSize;

    RCEBuffer();
    ~RCEBuffer();

    UBool isEmpty() const;
    void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode);
    const RCEI *get();
};

RCEBuffer::RCEBuffer()
{
    buffer = defaultBuffer;
    bufferIndex = 0;
    bufferSize = UPRV_LENGTHOF(defaultBuffer);
}

RCEBuffer::~RCEBuffer()
{
    if (buffer != defaultBuffer) {
        DELETE_ARRAY(buffer);
    }
}

UBool RCEBuffer::isEmpty() const
{
    return bufferIndex <= 0;
}

void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
{
    if (U_FAILURE(errorCode)) {
        return;
    }
    if (bufferIndex >= bufferSize) {
        RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);
        if (newBuffer == NULL) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        ARRAY_COPY(newBuffer, buffer, bufferSize);

        if (buffer != defaultBuffer) {
            DELETE_ARRAY(buffer);
        }

        buffer = newBuffer;
        bufferSize += BUFFER_GROW;
    }

    buffer[bufferIndex].ce   = ce;
    buffer[bufferIndex].low  = ixLow;
    buffer[bufferIndex].high = ixHigh;

    bufferIndex += 1;
}

const RCEI *RCEBuffer::get()
{
    if (bufferIndex > 0) {
     return &buffer[--bufferIndex];
    }

    return NULL;
}

PCEBuffer::PCEBuffer()
{
    buffer = defaultBuffer;
    bufferIndex = 0;
    bufferSize = UPRV_LENGTHOF(defaultBuffer);
}

PCEBuffer::~PCEBuffer()
{
    if (buffer != defaultBuffer) {
        DELETE_ARRAY(buffer);
    }
}

void PCEBuffer::reset()
{
    bufferIndex = 0;
}

UBool PCEBuffer::isEmpty() const
{
    return bufferIndex <= 0;
}

void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh, UErrorCode &errorCode)
{
    if (U_FAILURE(errorCode)) {
        return;
    }
    if (bufferIndex >= bufferSize) {
        PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);
        if (newBuffer == NULL) {
            errorCode = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        ARRAY_COPY(newBuffer, buffer, bufferSize);

        if (buffer != defaultBuffer) {
            DELETE_ARRAY(buffer);
        }

        buffer = newBuffer;
        bufferSize += BUFFER_GROW;
    }

    buffer[bufferIndex].ce   = ce;
    buffer[bufferIndex].low  = ixLow;
    buffer[bufferIndex].high = ixHigh;

    bufferIndex += 1;
}

const PCEI *PCEBuffer::get()
{
    if (bufferIndex > 0) {
     return &buffer[--bufferIndex];
    }

    return NULL;
}

UCollationPCE::UCollationPCE(UCollationElements *elems) { init(elems); }

UCollationPCE::UCollationPCE(CollationElementIterator *iter) { init(iter); }

void UCollationPCE::init(UCollationElements *elems) {
    init(CollationElementIterator::fromUCollationElements(elems));
}

void UCollationPCE::init(CollationElementIterator *iter)
{
    cei = iter;
    init(*iter->rbc_);
}

void UCollationPCE::init(const Collator &coll)
{
    UErrorCode status = U_ZERO_ERROR;

    strength    = coll.getAttribute(UCOL_STRENGTH, status);
    toShift     = coll.getAttribute(UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED;
    isShifted   = FALSE;
    variableTop = coll.getVariableTop(status);
}

UCollationPCE::~UCollationPCE()
{
    // nothing to do
}

uint64_t UCollationPCE::processCE(uint32_t ce)
{
    uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;

    // This is clean, but somewhat slow...
    // We could apply the mask to ce and then
    // just get all three orders...
    switch(strength) {
    default:
        tertiary = ucol_tertiaryOrder(ce);
        U_FALLTHROUGH;

    case UCOL_SECONDARY:
        secondary = ucol_secondaryOrder(ce);
        U_FALLTHROUGH;

    case UCOL_PRIMARY:
        primary = ucol_primaryOrder(ce);
    }

    // **** This should probably handle continuations too.  ****
    // **** That means that we need 24 bits for the primary ****
    // **** instead of the 16 that we're currently using.   ****
    // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
    // **** Another complication with continuations is that ****
    // **** the *second* CE is marked as a continuation, so ****
    // **** we always have to peek ahead to know how long   ****
    // **** the primary is...                               ****
    if ((toShift && variableTop > ce && primary != 0)
                || (isShifted && primary == 0)) {

        if (primary == 0) {
            return UCOL_IGNORABLE;
        }

        if (strength >= UCOL_QUATERNARY) {
            quaternary = primary;
        }

        primary = secondary = tertiary = 0;
        isShifted = TRUE;
    } else {
        if (strength >= UCOL_QUATERNARY) {
            quaternary = 0xFFFF;
        }

        isShifted = FALSE;
    }

    return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
}

U_NAMESPACE_END

/* public methods ---------------------------------------------------- */

U_CAPI UCollationElements* U_EXPORT2
ucol_openElements(const UCollator  *coll,
                  const UChar      *text,
                        int32_t    textLength,
                        UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return NULL;
    }
    if (coll == NULL || (text == NULL && textLength != 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll);
    if (rbc == NULL) {
        *status = U_UNSUPPORTED_ERROR;  // coll is a Collator but not a RuleBasedCollator
        return NULL;
    }

    UnicodeString s((UBool)(textLength < 0), text, textLength);
    CollationElementIterator *cei = rbc->createCollationElementIterator(s);
    if (cei == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    return cei->toUCollationElements();
}


U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements *elems)
{
    delete CollationElementIterator::fromUCollationElements(elems);
}

U_CAPI void U_EXPORT2
ucol_reset(UCollationElements *elems)
{
    CollationElementIterator::fromUCollationElements(elems)->reset();
}

U_CAPI int32_t U_EXPORT2
ucol_next(UCollationElements *elems, 
          UErrorCode         *status)
{
    if (U_FAILURE(*status)) {
        return UCOL_NULLORDER;
    }

    return CollationElementIterator::fromUCollationElements(elems)->next(*status);
}

// temporarily restore the following removed internal function which is used by Spotlight
U_CAPI int64_t U_EXPORT2
ucol_nextProcessed(UCollationElements *elems,
                   int32_t            *ixLow,
                   int32_t            *ixHigh,
                   UErrorCode         *status)
{
    return (UCollationPCE(elems)).nextProcessed(ixLow, ixHigh, status);
}


U_NAMESPACE_BEGIN

int64_t
UCollationPCE::nextProcessed(
                   int32_t            *ixLow,
                   int32_t            *ixHigh,
                   UErrorCode         *status)
{
    int64_t result = UCOL_IGNORABLE;
    uint32_t low = 0, high = 0;

    if (U_FAILURE(*status)) {
        return UCOL_PROCESSED_NULLORDER;
    }

    pceBuffer.reset();

    do {
        low = cei->getOffset();
        int32_t ce = cei->next(*status);
        high = cei->getOffset();

        if (ce == UCOL_NULLORDER) {
             result = UCOL_PROCESSED_NULLORDER;
             break;
        }

        result = processCE((uint32_t)ce);
    } while (result == UCOL_IGNORABLE);

    if (ixLow != NULL) {
        *ixLow = low;
    }

    if (ixHigh != NULL) {
        *ixHigh = high;
    }

    return result;
}

U_NAMESPACE_END

U_CAPI int32_t U_EXPORT2
ucol_previous(UCollationElements *elems,
              UErrorCode         *status)
{
    if(U_FAILURE(*status)) {
        return UCOL_NULLORDER;
    }
    return CollationElementIterator::fromUCollationElements(elems)->previous(*status);
}

// temporarily restore the following removed internal function which is used by Spotlight
U_CAPI int64_t U_EXPORT2
ucol_previousProcessed(UCollationElements *elems,
                   int32_t            *ixLow,
                   int32_t            *ixHigh,
                   UErrorCode         *status)
{
    return (UCollationPCE(elems)).previousProcessed(ixLow, ixHigh, status);
}

U_NAMESPACE_BEGIN

int64_t
UCollationPCE::previousProcessed(
                   int32_t            *ixLow,
                   int32_t            *ixHigh,
                   UErrorCode         *status)
{
    int64_t result = UCOL_IGNORABLE;
    int32_t  low = 0, high = 0;

    if (U_FAILURE(*status)) {
        return UCOL_PROCESSED_NULLORDER;
    }

    // pceBuffer.reset();

    while (pceBuffer.isEmpty()) {
        // buffer raw CEs up to non-ignorable primary
        RCEBuffer rceb;
        int32_t ce;
        
        // **** do we need to reset rceb, or will it always be empty at this point ****
        do {
            high = cei->getOffset();
            ce   = cei->previous(*status);
            low  = cei->getOffset();

            if (ce == UCOL_NULLORDER) {
                if (!rceb.isEmpty()) {
                    break;
                }

                goto finish;
            }

            rceb.put((uint32_t)ce, low, high, *status);
        } while (U_SUCCESS(*status) && ((ce & UCOL_PRIMARYORDERMASK) == 0 || isContinuation(ce)));

        // process the raw CEs
        while (U_SUCCESS(*status) && !rceb.isEmpty()) {
            const RCEI *rcei = rceb.get();

            result = processCE(rcei->ce);

            if (result != UCOL_IGNORABLE) {
                pceBuffer.put(result, rcei->low, rcei->high, *status);
            }
        }
        if (U_FAILURE(*status)) {
            return UCOL_PROCESSED_NULLORDER;
        }
    }

finish:
    if (pceBuffer.isEmpty()) {
        // **** Is -1 the right value for ixLow, ixHigh? ****
    	if (ixLow != NULL) {
    		*ixLow = -1;
    	}
    	
    	if (ixHigh != NULL) {
    		*ixHigh = -1
    		;
    	}
        return UCOL_PROCESSED_NULLORDER;
    }

    const PCEI *pcei = pceBuffer.get();

    if (ixLow != NULL) {
        *ixLow = pcei->low;
    }

    if (ixHigh != NULL) {
        *ixHigh = pcei->high;
    }

    return pcei->ce;
}

U_NAMESPACE_END

U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements *elems,
                           int32_t            order)
{
    return CollationElementIterator::fromUCollationElements(elems)->getMaxExpansion(order);

    // TODO: The old code masked the order according to strength and then did a binary search.
    // However this was probably at least partially broken because of the following comment.
    // Still, it might have found a match when this version may not.

    // FIXME: with a masked search, there might be more than one hit,
    // so we need to look forward and backward from the match to find all
    // of the hits...
}

U_CAPI void U_EXPORT2
ucol_setText(      UCollationElements *elems,
             const UChar              *text,
                   int32_t            textLength,
                   UErrorCode         *status)
{
    if (U_FAILURE(*status)) {
        return;
    }

    if ((text == NULL && textLength != 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    UnicodeString s((UBool)(textLength < 0), text, textLength);
    return CollationElementIterator::fromUCollationElements(elems)->setText(s, *status);
}

U_CAPI int32_t U_EXPORT2
ucol_getOffset(const UCollationElements *elems)
{
    return CollationElementIterator::fromUCollationElements(elems)->getOffset();
}

U_CAPI void U_EXPORT2
ucol_setOffset(UCollationElements    *elems,
               int32_t           offset,
               UErrorCode            *status)
{
    if (U_FAILURE(*status)) {
        return;
    }

    CollationElementIterator::fromUCollationElements(elems)->setOffset(offset, *status);
}

U_CAPI int32_t U_EXPORT2
ucol_primaryOrder (int32_t order) 
{
    return (order >> 16) & 0xffff;
}

U_CAPI int32_t U_EXPORT2
ucol_secondaryOrder (int32_t order) 
{
    return (order >> 8) & 0xff;
}

U_CAPI int32_t U_EXPORT2
ucol_tertiaryOrder (int32_t order) 
{
    return order & 0xff;
}

#endif /* #if !UCONFIG_NO_COLLATION */