ucmp8.c   [plain text]


/*
********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2004, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************
*/

#include "ucmp8.h"
#include "cmemory.h"

/* internal constants*/


U_CAPI int32_t U_EXPORT2
ucmp8_getkUnicodeCount() { return UCMP8_kUnicodeCount;}

U_CAPI int32_t U_EXPORT2
ucmp8_getkBlockCount() { return UCMP8_kBlockCount;}

U_CAPI void U_EXPORT2
ucmp8_initBogus(CompactByteArray* array)
{
    CompactByteArray* this_obj = array;

    if (this_obj == NULL) return;

    this_obj->fStructSize = sizeof(CompactByteArray);
    this_obj->fArray = NULL;
    this_obj->fIndex = NULL;
    this_obj->fCount = UCMP8_kUnicodeCount;
    this_obj->fCompact = FALSE;
    this_obj->fBogus = TRUE;
    this_obj->fAlias = FALSE;
    this_obj->fIAmOwned = TRUE;
}

/* debug flags*/
/*=======================================================*/
U_CAPI void U_EXPORT2
ucmp8_init(CompactByteArray* array, int8_t defaultValue)
{
/* set up the index array and the data array.
 * the index array always points into particular parts of the data array
 * it is initially set up to point at regular block boundaries
 * The following example uses blocks of 4 for simplicity
 * Example: Expanded
 * INDEX# 0   1   2   3   4
 * INDEX  0   4   8   12  16 ...
 * ARRAY  abcdeababcedzyabcdea...
 *        |   |   |   |   |   |...
 * whenever you set an element in the array, it unpacks to this state
 * After compression, the index will point to various places in the data array
 * wherever there is a runs of the same elements as in the original
 * Example: Compressed
 * INDEX# 0   1   2   3   4
 * INDEX  0   4   1   8   2 ...
 * ARRAY  abcdeabazyabc...
 * If you look at the example, index# 2 in the expanded version points
 * to data position number 8, which has elements "bced". In the compressed
 * version, index# 2 points to data position 1, which also has "bced"
 */
    CompactByteArray* this_obj = array;
    int32_t i;

    if (this_obj == NULL) return;

    this_obj->fStructSize = sizeof(CompactByteArray);
    this_obj->fArray = NULL;
    this_obj->fIndex = NULL;
    this_obj->fCount = UCMP8_kUnicodeCount;
    this_obj->fCompact = FALSE;
    this_obj->fBogus = FALSE;
    this_obj->fAlias = FALSE;
    this_obj->fIAmOwned = TRUE;


    this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
    if (!this_obj->fArray)
    {
        this_obj->fBogus = TRUE;
        return;
    }
    this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
    if (!this_obj->fIndex)
    {
        uprv_free(this_obj->fArray);
        this_obj->fArray = NULL;
        this_obj->fBogus = TRUE;
        return;
    }
    for (i = 0; i < UCMP8_kUnicodeCount; ++i)
    {
        this_obj->fArray[i] = defaultValue;
    }
    for (i = 0; i < UCMP8_kIndexCount; ++i)
    {
        this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
    }
}

U_CAPI CompactByteArray* U_EXPORT2
ucmp8_open(int8_t defaultValue)
{
/* set up the index array and the data array.
 * the index array always points into particular parts of the data array
 * it is initially set up to point at regular block boundaries
 * The following example uses blocks of 4 for simplicity
 * Example: Expanded
 * INDEX# 0   1   2   3   4
 * INDEX  0   4   8   12  16 ...
 * ARRAY  abcdeababcedzyabcdea...
 *        |   |   |   |   |   |...
 * whenever you set an element in the array, it unpacks to this state
 * After compression, the index will point to various places in the data array
 * wherever there is a runs of the same elements as in the original
 * Example: Compressed
 * INDEX# 0   1   2   3   4
 * INDEX  0   4   1   8   2 ...
 * ARRAY  abcdeabazyabc...
 * If you look at the example, index# 2 in the expanded version points
 * to data position number 8, which has elements "bced". In the compressed
 * version, index# 2 points to data position 1, which also has "bced"
 */
    CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
    int32_t i;

    if (this_obj == NULL) return NULL;

    this_obj->fStructSize = sizeof(CompactByteArray);
    this_obj->fArray = NULL;
    this_obj->fIndex = NULL;
    this_obj->fCount = UCMP8_kUnicodeCount;
    this_obj->fCompact = FALSE;
    this_obj->fBogus = FALSE;
    this_obj->fAlias = FALSE;
    this_obj->fIAmOwned = FALSE;


    this_obj->fArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
    if (!this_obj->fArray)
    {
        this_obj->fBogus = TRUE;
        return NULL;
    }
    this_obj->fIndex = (uint16_t*) uprv_malloc(sizeof(uint16_t) * UCMP8_kIndexCount);
    if (!this_obj->fIndex)
    {
        uprv_free(this_obj->fArray);
        this_obj->fArray = NULL;
        this_obj->fBogus = TRUE;
        return NULL;
    }
    for (i = 0; i < UCMP8_kUnicodeCount; ++i)
    {
        this_obj->fArray[i] = defaultValue;
    }
    for (i = 0; i < UCMP8_kIndexCount; ++i)
    {
        this_obj->fIndex[i] = (uint16_t)(i << UCMP8_kBlockShift);
    }

    return this_obj;
}

U_CAPI CompactByteArray* U_EXPORT2
ucmp8_openAdopt(uint16_t *indexArray,
                  int8_t *newValues,
                  int32_t count)
{
    CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
    /* test for NULL */
    if(this_obj == NULL)
        return NULL;
    ucmp8_initAdopt(this_obj, indexArray, newValues, count);
    this_obj->fIAmOwned = FALSE;
    return this_obj;
}

U_CAPI CompactByteArray* U_EXPORT2
ucmp8_openAlias(uint16_t *indexArray,
                  int8_t *newValues,
                  int32_t count)
{
    CompactByteArray* this_obj = (CompactByteArray*) uprv_malloc(sizeof(CompactByteArray));
    /* test for NULL */
    if(this_obj == NULL)
        return NULL;
    ucmp8_initAlias(this_obj, indexArray, newValues, count);
    this_obj->fIAmOwned = FALSE;
    return this_obj;
}

/*=======================================================*/

U_CAPI CompactByteArray* U_EXPORT2
ucmp8_initAdopt(CompactByteArray *this_obj,
                  uint16_t *indexArray,
                  int8_t *newValues,
                  int32_t count)
{
    if (this_obj) {
        this_obj->fCount = count;
        this_obj->fBogus = FALSE;
        this_obj->fStructSize = sizeof(CompactByteArray);

        this_obj->fArray = newValues;
        this_obj->fIndex = indexArray;
        this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
        this_obj->fAlias = FALSE;
        this_obj->fIAmOwned = TRUE;
    }

    return this_obj;
}

U_CAPI CompactByteArray* U_EXPORT2
ucmp8_initAlias(CompactByteArray *this_obj,
                  uint16_t *indexArray,
                  int8_t *newValues,
                  int32_t count)
{
    if (this_obj) {
        this_obj->fArray = NULL;
        this_obj->fIndex = NULL;
        this_obj->fCount = count;
        this_obj->fBogus = FALSE;
        this_obj->fStructSize = sizeof(CompactByteArray);

        this_obj->fArray = newValues;
        this_obj->fIndex = indexArray;
        this_obj->fCompact = (UBool)((count < UCMP8_kUnicodeCount) ? TRUE : FALSE);
        this_obj->fAlias = TRUE;
        this_obj->fIAmOwned = TRUE;
    }

    return this_obj;
}

/*=======================================================*/

U_CAPI void U_EXPORT2
ucmp8_close(CompactByteArray* this_obj)
{
    if(this_obj != NULL) {
        if(!this_obj->fAlias) {
            if(this_obj->fArray != NULL) {
                uprv_free(this_obj->fArray);
            }
            if(this_obj->fIndex != NULL) {
                uprv_free(this_obj->fIndex);
            }
        }
        if(!this_obj->fIAmOwned) /* Called if 'init' was called instead of 'open'. */
        {
            uprv_free(this_obj);
        }
    }
}


/*=======================================================*/

U_CAPI void U_EXPORT2
ucmp8_expand(CompactByteArray* this_obj)
{
    /* can optimize later.
     * if we have to expand, then walk through the blocks instead of using Get
     * this code unpacks the array by copying the blocks to the normalized position.
     * Example: Compressed
     * INDEX# 0   1   2   3   4
     * INDEX  0   4   1   8   2 ...
     * ARRAY  abcdeabazyabc...
     *  turns into
     * Example: Expanded
     * INDEX# 0   1   2   3   4
     * INDEX  0   4   8   12  16 ...
     * ARRAY  abcdeababcedzyabcdea...
     */
    int32_t i;
    if (this_obj->fCompact)
    {
      int8_t* tempArray;
      tempArray = (int8_t*) uprv_malloc(sizeof(int8_t) * UCMP8_kUnicodeCount);
      if (!tempArray)
      {
          this_obj->fBogus = TRUE;
          return;
      }
      for (i = 0; i < UCMP8_kUnicodeCount; ++i)
      {
          tempArray[i] = ucmp8_get(this_obj,(UChar)i);  /* HSYS : How expand?*/
      }
      for (i = 0; i < UCMP8_kIndexCount; ++i)
      {
          this_obj->fIndex[i] = (uint16_t)(i<< UCMP8_kBlockShift);
      }
      uprv_free(this_obj->fArray);
      this_obj->fArray = tempArray;
      this_obj->fCompact = FALSE;
      this_obj->fAlias = FALSE;

    }
}


/*=======================================================*/
/* this_obj->fArray:    an array to be overlapped
 * start and count: specify the block to be overlapped
 * tempIndex:   the overlapped array (actually indices back into inputContents)
 * inputHash:   an index of hashes for tempIndex, where
 *      inputHash[i] = XOR of values from i-count+1 to i
 */
static int32_t
findOverlappingPosition(CompactByteArray* this_obj,
            uint32_t start,
            const UChar* tempIndex,
            int32_t tempIndexCount,
            uint32_t cycle)
{
    /* this_obj is a utility routine for finding blocks that overlap.
     * IMPORTANT: the cycle number is very important. Small cycles take a lot
     * longer to work. In some cases, they may be able to get better compaction.
     */

    int32_t i;
    int32_t j;
    int32_t currentCount;

    for (i = 0; i < tempIndexCount; i += cycle)
    {
        currentCount = UCMP8_kBlockCount;
        if (i + UCMP8_kBlockCount > tempIndexCount)
        {
            currentCount = tempIndexCount - i;
        }
        for (j = 0; j < currentCount; ++j)
        {
            if (this_obj->fArray[start + j] != this_obj->fArray[tempIndex[i + j]])
                break;
        }
        if (j == currentCount)
            break;
    }

    return i;
}

U_CAPI UBool U_EXPORT2
ucmp8_isBogus(const CompactByteArray* this_obj)
{
    return (UBool)(this_obj == NULL || this_obj->fBogus);
}

U_CAPI const int8_t* U_EXPORT2
ucmp8_getArray(const CompactByteArray* this_obj)
{
    return this_obj->fArray;
}

U_CAPI const uint16_t* U_EXPORT2
ucmp8_getIndex(const CompactByteArray* this_obj)
{
    return this_obj->fIndex;
}

U_CAPI int32_t U_EXPORT2
ucmp8_getCount(const CompactByteArray* this_obj)
{
    return this_obj->fCount;
}


U_CAPI void U_EXPORT2
ucmp8_set(CompactByteArray* this_obj,
      UChar c,
      int8_t value)
{
    if (this_obj->fCompact == TRUE)
    {
        ucmp8_expand(this_obj);
        if (this_obj->fBogus) return;
    }
    this_obj->fArray[(int32_t)c] = value;
}


U_CAPI void U_EXPORT2
ucmp8_setRange(CompactByteArray* this_obj,
           UChar start,
           UChar end,
           int8_t value)
{
    int32_t i;
    if (this_obj->fCompact == TRUE)
    {
        ucmp8_expand(this_obj);
        if (this_obj->fBogus)
            return;
    }
    for (i = start; i <= end; ++i)
    {
        this_obj->fArray[i] = value;
    }
}


/*=======================================================*/

U_CAPI void U_EXPORT2
ucmp8_compact(CompactByteArray* this_obj,
          uint32_t cycle)
{
    if (!this_obj->fCompact)
    {
        /* this_obj actually does the compaction.
        * it walks throught the contents of the expanded array, finding the
        * first block in the data that matches the contents of the current index.
        * As it works, it keeps an updated pointer to the last position,
        * so that it knows how big to make the final array
        * If the matching succeeds, then the index will point into the data
        * at some earlier position.
        * If the matching fails, then last position pointer will be bumped,
        * and the index will point to that last block of data.
        */
        UChar*    tempIndex;
        int32_t     tempIndexCount;
        int8_t*     tempArray;
        int32_t     iBlock, iIndex;

        /* fix cycle, must be 0 < cycle <= blockcount*/
        if (cycle <= 0)
            cycle = 1;
        else if (cycle > (uint32_t)UCMP8_kBlockCount)
            cycle = UCMP8_kBlockCount;

        /* make temp storage, larger than we need*/
        tempIndex = (UChar*) uprv_malloc(sizeof(UChar)* UCMP8_kUnicodeCount);
        if (!tempIndex)
        {
            this_obj->fBogus = TRUE;
            return;
        }
        /* set up first block.*/
        tempIndexCount = UCMP8_kBlockCount;
        for (iIndex = 0; iIndex < UCMP8_kBlockCount; ++iIndex)
        {
            tempIndex[iIndex] = (uint16_t)iIndex;
        } /* endfor (iIndex = 0; .....)*/
        this_obj->fIndex[0] = 0;

        /* for each successive block, find out its first position in the compacted array*/
        for (iBlock = 1; iBlock < UCMP8_kIndexCount; ++iBlock)
        {
            int32_t newCount, firstPosition, block;
            block = iBlock << UCMP8_kBlockShift;
            /*      if (debugSmall) if (block > debugSmallLimit) break;*/
            firstPosition = findOverlappingPosition(this_obj,
                block,
                tempIndex,
                tempIndexCount,
                cycle);

            /* if not contained in the current list, copy the remainder
            * invariant; cumulativeHash[iBlock] = XOR of values from iBlock-kBlockCount+1 to iBlock
            * we do this_obj by XORing out cumulativeHash[iBlock-kBlockCount]
            */
            newCount = firstPosition + UCMP8_kBlockCount;
            if (newCount > tempIndexCount)
            {
                for (iIndex = tempIndexCount; iIndex < newCount; ++iIndex)
                {
                    tempIndex[iIndex] = (uint16_t)(iIndex - firstPosition + block);
                } /* endfor (iIndex = tempIndexCount....)*/
                tempIndexCount = newCount;
            } /* endif (newCount > tempIndexCount)*/
            this_obj->fIndex[iBlock] = (uint16_t)firstPosition;
        } /* endfor (iBlock = 1.....)*/

        /* now allocate and copy the items into the array*/
        tempArray = (int8_t*) uprv_malloc(tempIndexCount * sizeof(int8_t));
        if (!tempArray)
        {
            this_obj->fBogus = TRUE;
            uprv_free(tempIndex);
            return;
        }
        for (iIndex = 0; iIndex < tempIndexCount; ++iIndex)
        {
            tempArray[iIndex] = this_obj->fArray[tempIndex[iIndex]];
        }
        uprv_free(this_obj->fArray);
        this_obj->fArray = tempArray;
        this_obj->fCount = tempIndexCount;


        /* free up temp storage*/
        uprv_free(tempIndex);
        this_obj->fCompact = TRUE;
    } /* endif (!this_obj->fCompact)*/
}

#define MEMORY_WRITE(destAddr, source, sizeSoFar, len) \
        if (destAddr) {\
            uprv_memcpy(destAddr+sizeSoFar, source, len);\
        }\
        sizeSoFar += (len)

U_CAPI  uint32_t U_EXPORT2 ucmp8_flattenMem (const CompactByteArray* array, uint8_t *MS)
{
    int32_t size = 0;
    static const int32_t version = ICU_UCMP8_VERSION;

    MEMORY_WRITE(MS, &version, size, 4);

    MEMORY_WRITE(MS, &array->fCount, size, 4);

    MEMORY_WRITE(MS, array->fIndex, size, sizeof(array->fIndex[0])*UCMP8_kIndexCount);

    MEMORY_WRITE(MS, array->fArray, size, sizeof(array->fArray[0])*array->fCount);

    while(size%4) /* end padding */
    {
        uint8_t pad = 0;
        MEMORY_WRITE(MS, &pad, size, 1);
    }

    return size;
}

/* We use sizeof(*array), etc so that this code can be as portable as 
   possible between the ucmpX_ family.
*/

U_CAPI  void U_EXPORT2 ucmp8_initFromData(CompactByteArray *this_obj, const uint8_t **source, UErrorCode *status)
{
    uint32_t i;
    const uint8_t *oldSource = *source;

    if(U_FAILURE(*status))
        return;

    this_obj->fArray = NULL;
    this_obj->fIndex = NULL;
    this_obj->fBogus = FALSE;
    this_obj->fStructSize = sizeof(CompactByteArray);
    this_obj->fCompact = TRUE;
    this_obj->fAlias = TRUE;
    this_obj->fIAmOwned = TRUE;

    i = * ((const uint32_t*) *source);
    (*source) += 4;

    if(i != ICU_UCMP8_VERSION)
    {
        *status = U_INVALID_FORMAT_ERROR;
        return;
    }

    this_obj->fCount = * ((const uint32_t*)*source);
    (*source) += 4;

    this_obj->fIndex = (uint16_t*) *source;
    (*source) += sizeof(this_obj->fIndex[0])*UCMP8_kIndexCount;

    this_obj->fArray = (int8_t*) *source;
    (*source) += sizeof(this_obj->fArray[0])*this_obj->fCount;

    /* eat up padding */
    while((*source-(oldSource))%4)
        (*source)++;
}