ucoleitr.cpp   [plain text]


/*
******************************************************************************
*   Copyright (C) 2001-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
******************************************************************************
*
* File ucoleitr.cpp
*
* Modification History:
*
* Date        Name        Description
* 02/15/2001  synwee      Modified all methods to process its own function 
*                         instead of calling the equivalent c++ api (coleitr.h)
******************************************************************************/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/ucoleitr.h"
#include "unicode/ustring.h"
#include "unicode/sortkey.h"
#include "unicode/uobject.h"
#include "ucol_imp.h"
#include "cmemory.h"

U_NAMESPACE_USE

#define BUFFER_LENGTH             100

#define DEFAULT_BUFFER_SIZE 16
#define BUFFER_GROW 8

#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])

#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))

#define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0])

#define DELETE_ARRAY(array) uprv_free((void *) (array))

typedef struct icu::collIterate collIterator;

struct RCEI
{
    uint32_t ce;
    int32_t  low;
    int32_t  high;
};

U_NAMESPACE_BEGIN

struct RCEBuffer
{
    RCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
    RCEI   *buffer;
    int32_t bufferIndex;
    int32_t bufferSize;

    RCEBuffer();
    ~RCEBuffer();

    UBool empty() const;
    void  put(uint32_t ce, int32_t ixLow, int32_t ixHigh);
    const RCEI *get();
};

RCEBuffer::RCEBuffer()
{
    buffer = defaultBuffer;
    bufferIndex = 0;
    bufferSize = DEFAULT_BUFFER_SIZE;
}

RCEBuffer::~RCEBuffer()
{
    if (buffer != defaultBuffer) {
        DELETE_ARRAY(buffer);
    }
}

UBool RCEBuffer::empty() const
{
    return bufferIndex <= 0;
}

void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh)
{
    if (bufferIndex >= bufferSize) {
        RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW);

        ARRAY_COPY(newBuffer, buffer, bufferSize);

        if (buffer != defaultBuffer) {
            DELETE_ARRAY(buffer);
        }

        buffer = newBuffer;
        bufferSize += BUFFER_GROW;
    }

    buffer[bufferIndex].ce   = ce;
    buffer[bufferIndex].low  = ixLow;
    buffer[bufferIndex].high = ixHigh;

    bufferIndex += 1;
}

const RCEI *RCEBuffer::get()
{
    if (bufferIndex > 0) {
     return &buffer[--bufferIndex];
    }

    return NULL;
}

struct PCEI
{
    uint64_t ce;
    int32_t  low;
    int32_t  high;
};

struct PCEBuffer
{
    PCEI    defaultBuffer[DEFAULT_BUFFER_SIZE];
    PCEI   *buffer;
    int32_t bufferIndex;
    int32_t bufferSize;

    PCEBuffer();
    ~PCEBuffer();

    void  reset();
    UBool empty() const;
    void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh);
    const PCEI *get();
};

PCEBuffer::PCEBuffer()
{
    buffer = defaultBuffer;
    bufferIndex = 0;
    bufferSize = DEFAULT_BUFFER_SIZE;
}

PCEBuffer::~PCEBuffer()
{
    if (buffer != defaultBuffer) {
        DELETE_ARRAY(buffer);
    }
}

void PCEBuffer::reset()
{
    bufferIndex = 0;
}

UBool PCEBuffer::empty() const
{
    return bufferIndex <= 0;
}

void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh)
{
    if (bufferIndex >= bufferSize) {
        PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW);

        ARRAY_COPY(newBuffer, buffer, bufferSize);

        if (buffer != defaultBuffer) {
            DELETE_ARRAY(buffer);
        }

        buffer = newBuffer;
        bufferSize += BUFFER_GROW;
    }

    buffer[bufferIndex].ce   = ce;
    buffer[bufferIndex].low  = ixLow;
    buffer[bufferIndex].high = ixHigh;

    bufferIndex += 1;
}

const PCEI *PCEBuffer::get()
{
    if (bufferIndex > 0) {
     return &buffer[--bufferIndex];
    }

    return NULL;
}

/*
 * This inherits from UObject so that
 * it can be allocated by new and the
 * constructor for PCEBuffer is called.
 */
struct UCollationPCE : public UObject
{
    PCEBuffer          pceBuffer;
    UCollationStrength strength;
    UBool              toShift;
    UBool              isShifted;
    uint32_t           variableTop;

    UCollationPCE(UCollationElements *elems);
    ~UCollationPCE();

    void init(const UCollator *coll);

    virtual UClassID getDynamicClassID() const;
    static UClassID getStaticClassID();
};

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE)

UCollationPCE::UCollationPCE(UCollationElements *elems)
{
    init(elems->iteratordata_.coll);
}

void UCollationPCE::init(const UCollator *coll)
{
    UErrorCode status = U_ZERO_ERROR;

    strength    = ucol_getStrength(coll);
    toShift     = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
    isShifted   = FALSE;
    variableTop = coll->variableTopValue << 16;
}

UCollationPCE::~UCollationPCE()
{
    // nothing to do
}


U_NAMESPACE_END


inline uint64_t processCE(UCollationElements *elems, uint32_t ce)
{
    uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;

    // This is clean, but somewhat slow...
    // We could apply the mask to ce and then
    // just get all three orders...
    switch(elems->pce->strength) {
    default:
        tertiary = ucol_tertiaryOrder(ce);
        /* note fall-through */

    case UCOL_SECONDARY:
        secondary = ucol_secondaryOrder(ce);
        /* note fall-through */

    case UCOL_PRIMARY:
        primary = ucol_primaryOrder(ce);
    }

    // **** This should probably handle continuations too.  ****
    // **** That means that we need 24 bits for the primary ****
    // **** instead of the 16 that we're currently using.   ****
    // **** So we can lay out the 64 bits as: 24.12.12.16.  ****
    // **** Another complication with continuations is that ****
    // **** the *second* CE is marked as a continuation, so ****
    // **** we always have to peek ahead to know how long   ****
    // **** the primary is...                               ****
    if ((elems->pce->toShift && elems->pce->variableTop > ce && primary != 0)
                || (elems->pce->isShifted && primary == 0)) {

        if (primary == 0) {
            return UCOL_IGNORABLE;
        }

        if (elems->pce->strength >= UCOL_QUATERNARY) {
            quaternary = primary;
        }

        primary = secondary = tertiary = 0;
        elems->pce->isShifted = TRUE;
    } else {
        if (elems->pce->strength >= UCOL_QUATERNARY) {
            quaternary = 0xFFFF;
        }

        elems->pce->isShifted = FALSE;
    }

    return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
}

U_CAPI void U_EXPORT2
uprv_init_pce(const UCollationElements *elems)
{
    if (elems->pce != NULL) {
        elems->pce->init(elems->iteratordata_.coll);
    }
}



/* public methods ---------------------------------------------------- */

U_CAPI UCollationElements* U_EXPORT2
ucol_openElements(const UCollator  *coll,
                  const UChar      *text,
                        int32_t    textLength,
                        UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return NULL;
    }

    UCollationElements *result = new UCollationElements;
    if (result == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }

    result->reset_ = TRUE;
    result->isWritable = FALSE;
    result->pce = NULL;

    if (text == NULL) {
        textLength = 0;
    }
    uprv_init_collIterate(coll, text, textLength, &result->iteratordata_, status);

    return result;
}


U_CAPI void U_EXPORT2
ucol_closeElements(UCollationElements *elems)
{
	if (elems != NULL) {
	  collIterate *ci = &elems->iteratordata_;

	  if (ci->extendCEs) {
		  uprv_free(ci->extendCEs);
	  }

	  if (ci->offsetBuffer) {
		  uprv_free(ci->offsetBuffer);
	  }

	  if (elems->isWritable && elems->iteratordata_.string != NULL)
	  {
		uprv_free((UChar *)elems->iteratordata_.string);
	  }

	  if (elems->pce != NULL) {
		  delete elems->pce;
	  }

	  delete elems;
	}
}

U_CAPI void U_EXPORT2
ucol_reset(UCollationElements *elems)
{
    collIterate *ci = &(elems->iteratordata_);
    elems->reset_   = TRUE;
    ci->pos         = ci->string;
    if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) {
        ci->endp      = ci->string + u_strlen(ci->string);
    }
    ci->CEpos       = ci->toReturn = ci->CEs;
    ci->flags       = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN;
    if (ci->coll->normalizationMode == UCOL_ON) {
        ci->flags |= UCOL_ITER_NORM;
    }

    ci->writableBuffer.remove();
    ci->fcdPosition = NULL;

  //ci->offsetReturn = ci->offsetStore = NULL;
	ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
}

U_CAPI void U_EXPORT2
ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return;
    }

    if (elems == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT;
}

U_CAPI int32_t U_EXPORT2
ucol_next(UCollationElements *elems, 
          UErrorCode         *status)
{
    int32_t result;
    if (U_FAILURE(*status)) {
        return UCOL_NULLORDER;
    }

    elems->reset_ = FALSE;

    result = (int32_t)ucol_getNextCE(elems->iteratordata_.coll,
                                     &elems->iteratordata_, 
                                     status);

    if (result == UCOL_NO_MORE_CES) {
        result = UCOL_NULLORDER;
    }
    return result;
}

U_CAPI int64_t U_EXPORT2
ucol_nextProcessed(UCollationElements *elems,
                   int32_t            *ixLow,
                   int32_t            *ixHigh,
                   UErrorCode         *status)
{
    const UCollator *coll = elems->iteratordata_.coll;
    int64_t result = UCOL_IGNORABLE;
    uint32_t low = 0, high = 0;

    if (U_FAILURE(*status)) {
        return UCOL_PROCESSED_NULLORDER;
    }

    if (elems->pce == NULL) {
        elems->pce = new UCollationPCE(elems);
    } else {
        elems->pce->pceBuffer.reset();
    }

    elems->reset_ = FALSE;

    do {
        low = ucol_getOffset(elems);
        uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status);
        high = ucol_getOffset(elems);

        if (ce == UCOL_NO_MORE_CES) {
             result = UCOL_PROCESSED_NULLORDER;
             break;
        }

        result = processCE(elems, ce);
    } while (result == UCOL_IGNORABLE);

    if (ixLow != NULL) {
        *ixLow = low;
    }

    if (ixHigh != NULL) {
        *ixHigh = high;
    }

    return result;
}

U_CAPI int32_t U_EXPORT2
ucol_previous(UCollationElements *elems,
              UErrorCode         *status)
{
    if(U_FAILURE(*status)) {
        return UCOL_NULLORDER;
    }
    else
    {
        int32_t result;

        if (elems->reset_ && (elems->iteratordata_.pos == elems->iteratordata_.string)) {
            if (elems->iteratordata_.endp == NULL) {
                elems->iteratordata_.endp = elems->iteratordata_.string + 
                                            u_strlen(elems->iteratordata_.string);
                elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
            }
            elems->iteratordata_.pos = elems->iteratordata_.endp;
            elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
        }

        elems->reset_ = FALSE;

        result = (int32_t)ucol_getPrevCE(elems->iteratordata_.coll,
                                         &(elems->iteratordata_), 
                                         status);

        if (result == UCOL_NO_MORE_CES) {
            result = UCOL_NULLORDER;
        }

        return result;
    }
}

U_CAPI int64_t U_EXPORT2
ucol_previousProcessed(UCollationElements *elems,
                   int32_t            *ixLow,
                   int32_t            *ixHigh,
                   UErrorCode         *status)
{
    const UCollator *coll = elems->iteratordata_.coll;
    int64_t result = UCOL_IGNORABLE;
 // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
 // UCollationStrength strength = ucol_getStrength(coll);
 //  UBool toShift   = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) ==  UCOL_SHIFTED;
 // uint32_t variableTop = coll->variableTopValue;
    int32_t  low = 0, high = 0;

    if (U_FAILURE(*status)) {
        return UCOL_PROCESSED_NULLORDER;
    }

    if (elems->reset_ && 
        (elems->iteratordata_.pos == elems->iteratordata_.string)) {
        if (elems->iteratordata_.endp == NULL) {
            elems->iteratordata_.endp = elems->iteratordata_.string + 
                                        u_strlen(elems->iteratordata_.string);
            elems->iteratordata_.flags |= UCOL_ITER_HASLEN;
        }

        elems->iteratordata_.pos = elems->iteratordata_.endp;
        elems->iteratordata_.fcdPosition = elems->iteratordata_.endp;
    }

    if (elems->pce == NULL) {
        elems->pce = new UCollationPCE(elems);
    } else {
      //elems->pce->pceBuffer.reset();
    }

    elems->reset_ = FALSE;

    while (elems->pce->pceBuffer.empty()) {
        // buffer raw CEs up to non-ignorable primary
        RCEBuffer rceb;
        uint32_t ce;
        
        // **** do we need to reset rceb, or will it always be empty at this point ****
        do {
            high = ucol_getOffset(elems);
            ce   = ucol_getPrevCE(coll, &elems->iteratordata_, status);
            low  = ucol_getOffset(elems);

            if (ce == UCOL_NO_MORE_CES) {
                if (! rceb.empty()) {
                    break;
                }

                goto finish;
            }

            rceb.put(ce, low, high);
        } while ((ce & UCOL_PRIMARYMASK) == 0);

        // process the raw CEs
        while (! rceb.empty()) {
            const RCEI *rcei = rceb.get();

            result = processCE(elems, rcei->ce);

            if (result != UCOL_IGNORABLE) {
                elems->pce->pceBuffer.put(result, rcei->low, rcei->high);
            }
        }
    }

finish:
    if (elems->pce->pceBuffer.empty()) {
        // **** Is -1 the right value for ixLow, ixHigh? ****
    	if (ixLow != NULL) {
    		*ixLow = -1;
    	}
    	
    	if (ixHigh != NULL) {
    		*ixHigh = -1
    		;
    	}
        return UCOL_PROCESSED_NULLORDER;
    }

    const PCEI *pcei = elems->pce->pceBuffer.get();

    if (ixLow != NULL) {
        *ixLow = pcei->low;
    }

    if (ixHigh != NULL) {
        *ixHigh = pcei->high;
    }

    return pcei->ce;
}

U_CAPI int32_t U_EXPORT2
ucol_getMaxExpansion(const UCollationElements *elems,
                           int32_t            order)
{
    uint8_t result;

#if 0
    UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result);
#else
    const UCollator *coll = elems->iteratordata_.coll;
    const uint32_t *start;
    const uint32_t *limit;
    const uint32_t *mid;
          uint32_t strengthMask = 0;
          uint32_t mOrder = (uint32_t) order;

    switch (coll->strength) 
    {
    default:
        strengthMask |= UCOL_TERTIARYORDERMASK;
        /* fall through */

    case UCOL_SECONDARY:
        strengthMask |= UCOL_SECONDARYORDERMASK;
        /* fall through */

    case UCOL_PRIMARY:
        strengthMask |= UCOL_PRIMARYORDERMASK;
    }

    mOrder &= strengthMask;
    start = (coll)->endExpansionCE;
    limit = (coll)->lastEndExpansionCE;

    while (start < limit - 1) {
        mid = start + ((limit - start) >> 1);
        if (mOrder <= (*mid & strengthMask)) {
          limit = mid;
        } else {
          start = mid;
        }
    }

    // FIXME: with a masked search, there might be more than one hit,
    // so we need to look forward and backward from the match to find all
    // of the hits...
    if ((*start & strengthMask) == mOrder) {
        result = *((coll)->expansionCESize + (start - (coll)->endExpansionCE));
    } else if ((*limit & strengthMask) == mOrder) {
         result = *(coll->expansionCESize + (limit - coll->endExpansionCE));
   } else if ((mOrder & 0xFFFF) == 0x00C0) {
        result = 2;
   } else {
       result = 1;
   }
#endif

    return result;
}
 
U_CAPI void U_EXPORT2
ucol_setText(      UCollationElements *elems,
             const UChar              *text,
                   int32_t            textLength,
                   UErrorCode         *status)
{
    if (U_FAILURE(*status)) {
        return;
    }

    if (elems->isWritable && elems->iteratordata_.string != NULL)
    {
        uprv_free((UChar *)elems->iteratordata_.string);
    }

    if (text == NULL) {
        textLength = 0;
    }

    elems->isWritable = FALSE;
    
    /* free offset buffer to avoid memory leak before initializing. */
    ucol_freeOffsetBuffer(&(elems->iteratordata_));
    /* Ensure that previously allocated extendCEs is freed before setting to NULL. */
    if (elems->iteratordata_.extendCEs != NULL) {
        uprv_free(elems->iteratordata_.extendCEs);
    }
    uprv_init_collIterate(elems->iteratordata_.coll, text, textLength, 
                          &elems->iteratordata_, status);

    elems->reset_   = TRUE;
}

U_CAPI int32_t U_EXPORT2
ucol_getOffset(const UCollationElements *elems)
{
  const collIterate *ci = &(elems->iteratordata_);

  if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) {
      return ci->offsetRepeatValue;
  }

  if (ci->offsetReturn != NULL) {
      return *ci->offsetReturn;
  }

  // while processing characters in normalization buffer getOffset will 
  // return the next non-normalized character. 
  // should be inline with the old implementation since the old codes uses
  // nextDecomp in normalizer which also decomposes the string till the 
  // first base character is found.
  if (ci->flags & UCOL_ITER_INNORMBUF) {
      if (ci->fcdPosition == NULL) {
        return 0;
      }
      return (int32_t)(ci->fcdPosition - ci->string);
  }
  else {
      return (int32_t)(ci->pos - ci->string);
  }
}

U_CAPI void U_EXPORT2
ucol_setOffset(UCollationElements    *elems,
               int32_t           offset,
               UErrorCode            *status)
{
    if (U_FAILURE(*status)) {
        return;
    }

    // this methods will clean up any use of the writable buffer and points to 
    // the original string
    collIterate *ci = &(elems->iteratordata_);
    ci->pos         = ci->string + offset;
    ci->CEpos       = ci->toReturn = ci->CEs;
    if (ci->flags & UCOL_ITER_INNORMBUF) {
        ci->flags = ci->origFlags;
    }
    if ((ci->flags & UCOL_ITER_HASLEN) == 0) {
        ci->endp  = ci->string + u_strlen(ci->string);
        ci->flags |= UCOL_ITER_HASLEN;
    }
    ci->fcdPosition = NULL;
    elems->reset_ = FALSE;

	ci->offsetReturn = NULL;
    ci->offsetStore = ci->offsetBuffer;
	ci->offsetRepeatCount = ci->offsetRepeatValue = 0;
}

U_CAPI int32_t U_EXPORT2
ucol_primaryOrder (int32_t order) 
{
    order &= UCOL_PRIMARYMASK;
    return (order >> UCOL_PRIMARYORDERSHIFT);
}

U_CAPI int32_t U_EXPORT2
ucol_secondaryOrder (int32_t order) 
{
    order &= UCOL_SECONDARYMASK;
    return (order >> UCOL_SECONDARYORDERSHIFT);
}

U_CAPI int32_t U_EXPORT2
ucol_tertiaryOrder (int32_t order) 
{
    return (order & UCOL_TERTIARYMASK);
}


void ucol_freeOffsetBuffer(collIterate *s) {
    if (s != NULL && s->offsetBuffer != NULL) {
        uprv_free(s->offsetBuffer);
        s->offsetBuffer = NULL;
        s->offsetBufferSize = 0;
    }
}

#endif /* #if !UCONFIG_NO_COLLATION */