ucm.c   [plain text]


/*
*******************************************************************************
*
*   Copyright (C) 2003-2009, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucm.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003jun20
*   created by: Markus W. Scherer
*
*   This file reads a .ucm file, stores its mappings and sorts them.
*   It implements handling of Unicode conversion mappings from .ucm files
*   for makeconv, canonucm, rptp2ucm, etc.
*
*   Unicode code point sequences with a length of more than 1,
*   as well as byte sequences with more than 4 bytes or more than one complete
*   character sequence are handled to support m:n mappings.
*/

#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "cstring.h"
#include "cmemory.h"
#include "filestrm.h"
#include "uarrsort.h"
#include "ucnvmbcs.h"
#include "ucnv_bld.h"
#include "ucnv_ext.h"
#include "uparse.h"
#include "ucm.h"
#include <stdio.h>

#if !UCONFIG_NO_CONVERSION

/* -------------------------------------------------------------------------- */

static void
printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
    int32_t j;

    for(j=0; j<m->uLen; ++j) {
        fprintf(f, "<U%04lX>", (long)codePoints[j]);
    }

    fputc(' ', f);

    for(j=0; j<m->bLen; ++j) {
        fprintf(f, "\\x%02X", bytes[j]);
    }

    if(m->f>=0) {
        fprintf(f, " |%u\n", m->f);
    } else {
        fputs("\n", f);
    }
}

U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
    printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f);
}

U_CAPI void U_EXPORT2
ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
    UCMapping *m;
    int32_t i, length;

    m=table->mappings;
    length=table->mappingsLength;
    if(byUnicode) {
        for(i=0; i<length; ++m, ++i) {
            ucm_printMapping(table, m, f);
        }
    } else {
        const int32_t *map=table->reverseMap;
        for(i=0; i<length; ++i) {
            ucm_printMapping(table, m+map[i], f);
        }
    }
}

/* mapping comparisons ------------------------------------------------------ */

static int32_t
compareUnicode(UCMTable *lTable, const UCMapping *l,
               UCMTable *rTable, const UCMapping *r) {
    const UChar32 *lu, *ru;
    int32_t result, i, length;

    if(l->uLen==1 && r->uLen==1) {
        /* compare two single code points */
        return l->u-r->u;
    }

    /* get pointers to the code point sequences */
    lu=UCM_GET_CODE_POINTS(lTable, l);
    ru=UCM_GET_CODE_POINTS(rTable, r);

    /* get the minimum length */
    if(l->uLen<=r->uLen) {
        length=l->uLen;
    } else {
        length=r->uLen;
    }

    /* compare the code points */
    for(i=0; i<length; ++i) {
        result=lu[i]-ru[i];
        if(result!=0) {
            return result;
        }
    }

    /* compare the lengths */
    return l->uLen-r->uLen;
}

static int32_t
compareBytes(UCMTable *lTable, const UCMapping *l,
             UCMTable *rTable, const UCMapping *r,
             UBool lexical) {
    const uint8_t *lb, *rb;
    int32_t result, i, length;

    /*
     * A lexical comparison is used for sorting in the builder, to allow
     * an efficient search for a byte sequence that could be a prefix
     * of a previously entered byte sequence.
     *
     * Comparing by lengths first is for compatibility with old .ucm tools
     * like canonucm and rptp2ucm.
     */
    if(lexical) {
        /* get the minimum length and continue */
        if(l->bLen<=r->bLen) {
            length=l->bLen;
        } else {
            length=r->bLen;
        }
    } else {
        /* compare lengths first */
        result=l->bLen-r->bLen;
        if(result!=0) {
            return result;
        } else {
            length=l->bLen;
        }
    }

    /* get pointers to the byte sequences */
    lb=UCM_GET_BYTES(lTable, l);
    rb=UCM_GET_BYTES(rTable, r);

    /* compare the bytes */
    for(i=0; i<length; ++i) {
        result=lb[i]-rb[i];
        if(result!=0) {
            return result;
        }
    }

    /* compare the lengths */
    return l->bLen-r->bLen;
}

/* compare UCMappings for sorting */
static int32_t
compareMappings(UCMTable *lTable, const UCMapping *l,
                UCMTable *rTable, const UCMapping *r,
                UBool uFirst) {
    int32_t result;

    /* choose which side to compare first */
    if(uFirst) {
        /* Unicode then bytes */
        result=compareUnicode(lTable, l, rTable, r);
        if(result==0) {
            result=compareBytes(lTable, l, rTable, r, FALSE); /* not lexically, like canonucm */
        }
    } else {
        /* bytes then Unicode */
        result=compareBytes(lTable, l, rTable, r, TRUE); /* lexically, for builder */
        if(result==0) {
            result=compareUnicode(lTable, l, rTable, r);
        }
    }

    if(result!=0) {
        return result;
    }

    /* compare the flags */
    return l->f-r->f;
}

/* sorting by Unicode first sorts mappings directly */
static int32_t
compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
    return compareMappings(
        (UCMTable *)context, (const UCMapping *)left,
        (UCMTable *)context, (const UCMapping *)right, TRUE);
}

/* sorting by bytes first sorts the reverseMap; use indirection to mappings */
static int32_t
compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
    UCMTable *table=(UCMTable *)context;
    int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
    return compareMappings(
        table, table->mappings+l,
        table, table->mappings+r, FALSE);
}

U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable *t) {
    UErrorCode errorCode;
    int32_t i;

    if(t->isSorted) {
        return;
    }

    errorCode=U_ZERO_ERROR;

    /* 1. sort by Unicode first */
    uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
                   compareMappingsUnicodeFirst, t,
                   FALSE, &errorCode);

    /* build the reverseMap */
    if(t->reverseMap==NULL) {
        /*
         * allocate mappingsCapacity instead of mappingsLength so that
         * if mappings are added, the reverseMap need not be
         * reallocated each time
         * (see ucm_moveMappings() and ucm_addMapping())
         */
        t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
        if(t->reverseMap==NULL) {
            fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
            exit(U_MEMORY_ALLOCATION_ERROR);
        }
    }
    for(i=0; i<t->mappingsLength; ++i) {
        t->reverseMap[i]=i;
    }

    /* 2. sort reverseMap by mappings bytes first */
    uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
                   compareMappingsBytesFirst, t,
                   FALSE, &errorCode);

    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
                u_errorName(errorCode));
        exit(errorCode);
    }

    t->isSorted=TRUE;
}

/*
 * remove mappings with their move flag set from the base table
 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table
 */
U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable *base, UCMTable *ext) {
    UCMapping *mb, *mbLimit;
    int8_t flag;

    mb=base->mappings;
    mbLimit=mb+base->mappingsLength;

    while(mb<mbLimit) {
        flag=mb->moveFlag;
        if(flag!=0) {
            /* reset the move flag */
            mb->moveFlag=0;

            if(ext!=NULL && (flag&UCM_MOVE_TO_EXT)) {
                /* add the mapping to the extension table */
                ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
            }

            /* remove this mapping: move the last base mapping down and overwrite the current one */
            if(mb<(mbLimit-1)) {
                uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
            }
            --mbLimit;
            --base->mappingsLength;
            base->isSorted=FALSE;
        } else {
            ++mb;
        }
    }
}

enum {
    NEEDS_MOVE=1,
    HAS_ERRORS=2
};

static uint8_t
checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
                    UBool moveToExt, UBool intersectBase) {
    UCMapping *mb, *me, *mbLimit, *meLimit;
    int32_t cmp;
    uint8_t result;

    mb=base->mappings;
    mbLimit=mb+base->mappingsLength;

    me=ext->mappings;
    meLimit=me+ext->mappingsLength;

    result=0;

    for(;;) {
        /* skip irrelevant mappings on both sides */
        for(;;) {
            if(mb==mbLimit) {
                return result;
            }

            if(0<=mb->f && mb->f<=2) {
                break;
            }

            ++mb;
        }

        for(;;) {
            if(me==meLimit) {
                return result;
            }

            if(0<=me->f && me->f<=2) {
                break;
            }

            ++me;
        }

        /* compare the base and extension mappings */
        cmp=compareUnicode(base, mb, ext, me);
        if(cmp<0) {
            if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
                /*
                 * mapping in base but not in ext, move it
                 *
                 * if ext is DBCS, move DBCS mappings here
                 * and check SBCS ones for Unicode prefix below
                 */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;

            /* does mb map from an input sequence that is a prefix of me's? */
            } else if( mb->uLen<me->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
            ) {
                if(moveToExt) {
                    /* mark this mapping to be moved to the extension table */
                    mb->moveFlag|=UCM_MOVE_TO_EXT;
                    result|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            "           is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, mb, stderr);
                    ucm_printMapping(ext, me, stderr);
                    result|=HAS_ERRORS;
                }
            }

            ++mb;
        } else if(cmp==0) {
            /*
             * same output: remove the extension mapping,
             * otherwise treat as an error
             */
            if( mb->f==me->f && mb->bLen==me->bLen &&
                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
            ) {
                me->moveFlag|=UCM_REMOVE_MAPPING;
                result|=NEEDS_MOVE;
            } else if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;
            } else {
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        "           is the same as the input sequence of an extension mapping\n"
                        "           but it maps differently\n");
                ucm_printMapping(base, mb, stderr);
                ucm_printMapping(ext, me, stderr);
                result|=HAS_ERRORS;
            }

            ++mb;
        } else /* cmp>0 */ {
            ++me;
        }
    }
}

static uint8_t
checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
                  UBool moveToExt, UBool intersectBase) {
    UCMapping *mb, *me;
    int32_t *baseMap, *extMap;
    int32_t b, e, bLimit, eLimit, cmp;
    uint8_t result;
    UBool isSISO;

    baseMap=base->reverseMap;
    extMap=ext->reverseMap;

    b=e=0;
    bLimit=base->mappingsLength;
    eLimit=ext->mappingsLength;

    result=0;

    isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);

    for(;;) {
        /* skip irrelevant mappings on both sides */
        for(;; ++b) {
            if(b==bLimit) {
                return result;
            }
            mb=base->mappings+baseMap[b];

            if(intersectBase==2 && mb->bLen==1) {
                /*
                 * comparing a base against a DBCS extension:
                 * leave SBCS base mappings alone
                 */
                continue;
            }

            if(mb->f==0 || mb->f==3) {
                break;
            }
        }

        for(;;) {
            if(e==eLimit) {
                return result;
            }
            me=ext->mappings+extMap[e];

            if(me->f==0 || me->f==3) {
                break;
            }

            ++e;
        }

        /* compare the base and extension mappings */
        cmp=compareBytes(base, mb, ext, me, TRUE);
        if(cmp<0) {
            if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;

            /*
             * does mb map from an input sequence that is a prefix of me's?
             * for SI/SO tables, a single byte is never a prefix because it
             * occurs in a separate single-byte state
             */
            } else if( mb->bLen<me->bLen &&
                (!isSISO || mb->bLen>1) &&
                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
            ) {
                if(moveToExt) {
                    /* mark this mapping to be moved to the extension table */
                    mb->moveFlag|=UCM_MOVE_TO_EXT;
                    result|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            "           is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, mb, stderr);
                    ucm_printMapping(ext, me, stderr);
                    result|=HAS_ERRORS;
                }
            }

            ++b;
        } else if(cmp==0) {
            /*
             * same output: remove the extension mapping,
             * otherwise treat as an error
             */
            if( mb->f==me->f && mb->uLen==me->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
            ) {
                me->moveFlag|=UCM_REMOVE_MAPPING;
                result|=NEEDS_MOVE;
            } else if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;
            } else {
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        "           is the same as the input sequence of an extension mapping\n"
                        "           but it maps differently\n");
                ucm_printMapping(base, mb, stderr);
                ucm_printMapping(ext, me, stderr);
                result|=HAS_ERRORS;
            }

            ++b;
        } else /* cmp>0 */ {
            ++e;
        }
    }
}

U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
    UCMapping *m, *mLimit;
    int32_t count;
    UBool isOK;

    m=table->mappings;
    mLimit=m+table->mappingsLength;
    isOK=TRUE;

    while(m<mLimit) {
        count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
        if(count<1) {
            ucm_printMapping(table, m, stderr);
            isOK=FALSE;
        }
        ++m;
    }

    return isOK;
}

U_CAPI UBool U_EXPORT2
ucm_checkBaseExt(UCMStates *baseStates,
                 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
                 UBool intersectBase) {
    uint8_t result;

    /* if we have an extension table, we must always use precision flags */
    if(base->flagsType&UCM_FLAGS_IMPLICIT) {
        fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
        return FALSE;
    }
    if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
        fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
        return FALSE;
    }

    /* checking requires both tables to be sorted */
    ucm_sortTable(base);
    ucm_sortTable(ext);

    /* check */
    result=
        checkBaseExtUnicode(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase)|
        checkBaseExtBytes(baseStates, base, ext, (UBool)(moveTarget!=NULL), intersectBase);

    if(result&HAS_ERRORS) {
        return FALSE;
    }

    if(result&NEEDS_MOVE) {
        ucm_moveMappings(ext, NULL);
        ucm_moveMappings(base, moveTarget);
        ucm_sortTable(base);
        ucm_sortTable(ext);
        if(moveTarget!=NULL) {
            ucm_sortTable(moveTarget);
        }
    }

    return TRUE;
}

/* merge tables for rptp2ucm ------------------------------------------------ */

U_CAPI void U_EXPORT2
ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
                const uint8_t *subchar, int32_t subcharLength,
                uint8_t subchar1) {
    UCMapping *fromUMapping, *toUMapping;
    int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;

    ucm_sortTable(fromUTable);
    ucm_sortTable(toUTable);

    fromUMapping=fromUTable->mappings;
    toUMapping=toUTable->mappings;

    fromUTop=fromUTable->mappingsLength;
    toUTop=toUTable->mappingsLength;

    fromUIndex=toUIndex=0;

    while(fromUIndex<fromUTop && toUIndex<toUTop) {
        cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
        if(cmp==0) {
            /* equal: roundtrip, nothing to do (flags are initially 0) */
            ++fromUMapping;
            ++toUMapping;

            ++fromUIndex;
            ++toUIndex;
        } else if(cmp<0) {
            /*
             * the fromU mapping does not have a toU counterpart:
             * fallback Unicode->codepage
             */
            if( (fromUMapping->bLen==subcharLength &&
                 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
                (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
            ) {
                fromUMapping->f=2; /* SUB mapping */
            } else {
                fromUMapping->f=1; /* normal fallback */
            }

            ++fromUMapping;
            ++fromUIndex;
        } else {
            /*
             * the toU mapping does not have a fromU counterpart:
             * (reverse) fallback codepage->Unicode, copy it to the fromU table
             */

            /* ignore reverse fallbacks to Unicode SUB */
            if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
                toUMapping->f=3; /* reverse fallback */
                ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));

                /* the table may have been reallocated */
                fromUMapping=fromUTable->mappings+fromUIndex;
            }

            ++toUMapping;
            ++toUIndex;
        }
    }

    /* either one or both tables are exhausted */
    while(fromUIndex<fromUTop) {
        /* leftover fromU mappings are fallbacks */
        if( (fromUMapping->bLen==subcharLength &&
             0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
            (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
        ) {
            fromUMapping->f=2; /* SUB mapping */
        } else {
            fromUMapping->f=1; /* normal fallback */
        }

        ++fromUMapping;
        ++fromUIndex;
    }

    while(toUIndex<toUTop) {
        /* leftover toU mappings are reverse fallbacks */

        /* ignore reverse fallbacks to Unicode SUB */
        if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
            toUMapping->f=3; /* reverse fallback */
            ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
        }

        ++toUMapping;
        ++toUIndex;
    }

    fromUTable->isSorted=FALSE;
}

/* separate extension mappings out of base table for rptp2ucm --------------- */

U_CAPI UBool U_EXPORT2
ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
    UCMTable *table;
    UCMapping *m, *mLimit;
    int32_t type;
    UBool needsMove, isOK;

    table=ucm->base;
    m=table->mappings;
    mLimit=m+table->mappingsLength;

    needsMove=FALSE;
    isOK=TRUE;

    for(; m<mLimit; ++m) {
        if(isSISO && m->bLen==1 && (m->b.bytes[0]==0xe || m->b.bytes[0]==0xf)) {
            fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
            ucm_printMapping(table, m, stderr);
            m->moveFlag|=UCM_REMOVE_MAPPING;
            needsMove=TRUE;
            continue;
        }

        type=ucm_mappingType(
                &ucm->states, m,
                UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m));
        if(type<0) {
            /* illegal byte sequence */
            printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), stderr);
            isOK=FALSE;
        } else if(type>0) {
            m->moveFlag|=UCM_MOVE_TO_EXT;
            needsMove=TRUE;
        }
    }

    if(!isOK) {
        return FALSE;
    }
    if(needsMove) {
        ucm_moveMappings(ucm->base, ucm->ext);
        return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
    } else {
        ucm_sortTable(ucm->base);
        return TRUE;
    }
}

/* ucm parser --------------------------------------------------------------- */

U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
    const char *s=*ps;
    char *end;
    uint8_t byte;
    int8_t bLen;

    bLen=0;
    for(;;) {
        /* skip an optional plus sign */
        if(bLen>0 && *s=='+') {
            ++s;
        }
        if(*s!='\\') {
            break;
        }

        if( s[1]!='x' ||
            (byte=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4
        ) {
            fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
            return -1;
        }

        if(bLen==UCNV_EXT_MAX_BYTES) {
            fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
            return -1;
        }
        bytes[bLen++]=byte;
        s=end;
    }

    *ps=s;
    return bLen;
}

/* parse a mapping line; must not be empty */
U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping *m,
                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
                     const char *line) {
    const char *s;
    char *end;
    UChar32 cp;
    int32_t u16Length;
    int8_t uLen, bLen, f;

    s=line;
    uLen=bLen=0;

    /* parse code points */
    for(;;) {
        /* skip an optional plus sign */
        if(uLen>0 && *s=='+') {
            ++s;
        }
        if(*s!='<') {
            break;
        }

        if( s[1]!='U' ||
            (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
            *end!='>'
        ) {
            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
            return FALSE;
        }
        if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
            return FALSE;
        }

        if(uLen==UCNV_EXT_MAX_UCHARS) {
            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
            return FALSE;
        }
        codePoints[uLen++]=cp;
        s=end+1;
    }

    if(uLen==0) {
        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
        return FALSE;
    } else if(uLen==1) {
        m->u=codePoints[0];
    } else {
        UErrorCode errorCode=U_ZERO_ERROR;
        u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
        if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
            u16Length>UCNV_EXT_MAX_UCHARS
        ) {
            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
            return FALSE;
        }
    }

    s=u_skipWhitespace(s);

    /* parse bytes */
    bLen=ucm_parseBytes(bytes, line, &s);

    if(bLen<0) {
        return FALSE;
    } else if(bLen==0) {
        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
        return FALSE;
    } else if(bLen<=4) {
        uprv_memcpy(m->b.bytes, bytes, bLen);
    }

    /* skip everything until the fallback indicator, even the start of a comment */
    for(;;) {
        if(*s==0) {
            f=-1; /* no fallback indicator */
            break;
        } else if(*s=='|') {
            f=(int8_t)(s[1]-'0');
            if((uint8_t)f>3) {
                fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line);
                return FALSE;
            }
            break;
        }
        ++s;
    }

    m->uLen=uLen;
    m->bLen=bLen;
    m->f=f;
    return TRUE;
}

/* general APIs ------------------------------------------------------------- */

U_CAPI UCMTable * U_EXPORT2
ucm_openTable() {
    UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable));
    if(table==NULL) {
        fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    memset(table, 0, sizeof(UCMTable));
    return table;
}

U_CAPI void U_EXPORT2
ucm_closeTable(UCMTable *table) {
    if(table!=NULL) {
        uprv_free(table->mappings);
        uprv_free(table->codePoints);
        uprv_free(table->bytes);
        uprv_free(table->reverseMap);
        uprv_free(table);
    }
}

U_CAPI void U_EXPORT2
ucm_resetTable(UCMTable *table) {
    if(table!=NULL) {
        table->mappingsLength=0;
        table->flagsType=0;
        table->unicodeMask=0;
        table->bytesLength=table->codePointsLength=0;
        table->isSorted=FALSE;
    }
}

U_CAPI void U_EXPORT2
ucm_addMapping(UCMTable *table,
               UCMapping *m,
               UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
               uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    UCMapping *tm;
    UChar32 c;
    int32_t idx;

    if(table->mappingsLength>=table->mappingsCapacity) {
        /* make the mappings array larger */
        if(table->mappingsCapacity==0) {
            table->mappingsCapacity=1000;
        } else {
            table->mappingsCapacity*=10;
        }
        table->mappings=(UCMapping *)uprv_realloc(table->mappings,
                                             table->mappingsCapacity*sizeof(UCMapping));
        if(table->mappings==NULL) {
            fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
                            (int)table->mappingsCapacity);
            exit(U_MEMORY_ALLOCATION_ERROR);
        }

        if(table->reverseMap!=NULL) {
            /* the reverseMap must be reallocated in a new sort */
            uprv_free(table->reverseMap);
            table->reverseMap=NULL;
        }
    }

    if(m->uLen>1 && table->codePointsCapacity==0) {
        table->codePointsCapacity=10000;
        table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
        if(table->codePoints==NULL) {
            fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
                            (int)table->codePointsCapacity);
            exit(U_MEMORY_ALLOCATION_ERROR);
        }
    }

    if(m->bLen>4 && table->bytesCapacity==0) {
        table->bytesCapacity=10000;
        table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
        if(table->bytes==NULL) {
            fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
                            (int)table->bytesCapacity);
            exit(U_MEMORY_ALLOCATION_ERROR);
        }
    }

    if(m->uLen>1) {
        idx=table->codePointsLength;
        table->codePointsLength+=m->uLen;
        if(table->codePointsLength>table->codePointsCapacity) {
            fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
            exit(U_MEMORY_ALLOCATION_ERROR);
        }

        uprv_memcpy(table->codePoints+idx, codePoints, m->uLen*4);
        m->u=idx;
    }

    if(m->bLen>4) {
        idx=table->bytesLength;
        table->bytesLength+=m->bLen;
        if(table->bytesLength>table->bytesCapacity) {
            fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
            exit(U_MEMORY_ALLOCATION_ERROR);
        }

        uprv_memcpy(table->bytes+idx, bytes, m->bLen);
        m->b.idx=idx;
    }

    /* set unicodeMask */
    for(idx=0; idx<m->uLen; ++idx) {
        c=codePoints[idx];
        if(c>=0x10000) {
            table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
        } else if(U_IS_SURROGATE(c)) {
            table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
        }
    }

    /* set flagsType */
    if(m->f<0) {
        table->flagsType|=UCM_FLAGS_IMPLICIT;
    } else {
        table->flagsType|=UCM_FLAGS_EXPLICIT;
    }

    tm=table->mappings+table->mappingsLength++;
    uprv_memcpy(tm, m, sizeof(UCMapping));

    table->isSorted=FALSE;
}

U_CAPI UCMFile * U_EXPORT2
ucm_open() {
    UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile));
    if(ucm==NULL) {
        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    memset(ucm, 0, sizeof(UCMFile));

    ucm->base=ucm_openTable();
    ucm->ext=ucm_openTable();

    ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
    ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER;
    ucm->states.outputType=-1;
    ucm->states.minCharLength=ucm->states.maxCharLength=1;

    return ucm;
}

U_CAPI void U_EXPORT2
ucm_close(UCMFile *ucm) {
    if(ucm!=NULL) {
        uprv_free(ucm->base);
        uprv_free(ucm->ext);
        uprv_free(ucm);
    }
}

U_CAPI int32_t U_EXPORT2
ucm_mappingType(UCMStates *baseStates,
                UCMapping *m,
                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    /* check validity of the bytes and count the characters in them */
    int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
    if(count<1) {
        /* illegal byte sequence */
        return -1;
    }

    /*
     * Suitable for an ICU conversion base table means:
     * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
     * - SBCS: any 1:1 mapping
     *         (the table stores additional bits to distinguish mapping types)
     * - MBCS: not a |2 SUB mapping for <subchar1>
     * - MBCS: not a |1 fallback to 0x00
     * - MBCS: not a multi-byte mapping with leading 0x00 bytes
     *
     * Further restrictions for fromUnicode tables
     * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
     *
     * All of the MBCS fromUnicode specific tests could be removed from here,
     * but the ones above are for unusual mappings, and removing the tests
     * from here would change canonucm output which seems gratuitous.
     * (Markus Scherer 2006-nov-28)
     *
     * Exception: All implicit mappings (f<0) that need to be moved
     * because of fromUnicode restrictions _must_ be moved here because
     * makeconv uses a hack for moving mappings only for the fromUnicode table
     * that only works with non-negative values of f.
     */
    if( m->uLen==1 && count==1 &&
        (baseStates->maxCharLength==1 ||
            !((m->f==2 && m->bLen==1) ||
              (m->f==1 && bytes[0]==0) ||
              (m->f<=1 && m->bLen>1 && bytes[0]==0)))
    ) {
        return 0; /* suitable for a base table */
    } else {
        return 1; /* needs to go into an extension table */
    }
}

U_CAPI UBool U_EXPORT2
ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
                   UCMapping *m,
                   UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                   uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    int32_t type;

    if(m->f==2 && m->uLen>1) {
        fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
        printMapping(m, codePoints, bytes, stderr);
        return FALSE;
    }

    if(baseStates!=NULL) {
        /* check validity of the bytes and count the characters in them */
        type=ucm_mappingType(baseStates, m, codePoints, bytes);
        if(type<0) {
            /* illegal byte sequence */
            printMapping(m, codePoints, bytes, stderr);
            return FALSE;
        }
    } else {
        /* not used - adding a mapping for an extension-only table before its base table is read */
        type=1;
    }

    /*
     * Add the mapping to the base table if this is requested and suitable.
     * Otherwise, add it to the extension table.
     */
    if(forBase && type==0) {
        ucm_addMapping(ucm->base, m, codePoints, bytes);
    } else {
        ucm_addMapping(ucm->ext, m, codePoints, bytes);
    }

    return TRUE;
}

U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
    UCMapping m={ 0 };
    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
    uint8_t bytes[UCNV_EXT_MAX_BYTES];

    const char *s;

    /* ignore empty and comment lines */
    if(line[0]=='#' || *(s=u_skipWhitespace(line))==0 || *s=='\n' || *s=='\r') {
        return TRUE;
    }

    return
        ucm_parseMappingLine(&m, codePoints, bytes, line) &&
        ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
}

U_CAPI void U_EXPORT2
ucm_readTable(UCMFile *ucm, FileStream* convFile,
              UBool forBase, UCMStates *baseStates,
              UErrorCode *pErrorCode) {
    char line[500];
    char *end;
    UBool isOK;
    
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    isOK=TRUE;

    for(;;) {
        /* read the next line */
        if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
            fprintf(stderr, "incomplete charmap section\n");
            isOK=FALSE;
            break;
        }

        /* remove CR LF */
        end=uprv_strchr(line, 0);
        while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
            --end;
        }
        *end=0;

        /* ignore empty and comment lines */
        if(line[0]==0 || line[0]=='#') {
            continue;
        }

        /* stop at the end of the mapping table */
        if(0==uprv_strcmp(line, "END CHARMAP")) {
            break;
        }

        isOK&=ucm_addMappingFromLine(ucm, line, forBase, baseStates);
    }

    if(!isOK) {
        *pErrorCode=U_INVALID_TABLE_FORMAT;
    }
}
#endif