ucnv_ext.c   [plain text]


/*
******************************************************************************
*
*   Copyright (C) 2003-2004, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  ucnv_ext.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003jun13
*   created by: Markus W. Scherer
*
*   Conversion extensions
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION

#include "unicode/uset.h"
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "ucnv_ext.h"
#include "cmemory.h"

/* to Unicode --------------------------------------------------------------- */

/*
 * @return lookup value for the byte, if found; else 0
 */
static U_INLINE uint32_t
ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) {
    uint32_t word0, word;
    int32_t i, start, limit;

    /* check the input byte against the lowest and highest section bytes */
    start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]);
    limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]);
    if(byte<start || limit<byte) {
        return 0; /* the byte is out of range */
    }

    if(length==((limit-start)+1)) {
        /* direct access on a linear array */
        return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */
    }

    /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */
    word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0);

    /*
     * Shift byte once instead of each section word and add 0xffffff.
     * We will compare the shifted/added byte (bbffffff) against
     * section words which have byte values in the same bit position.
     * If and only if byte bb < section byte ss then bbffffff<ssvvvvvv
     * for all v=0..f
     * so we need not mask off the lower 24 bits of each section word.
     */
    word=word0|UCNV_EXT_TO_U_VALUE_MASK;

    /* binary search */
    start=0;
    limit=length;
    for(;;) {
        i=limit-start;
        if(i<=1) {
            break; /* done */
        }
        /* start<limit-1 */

        if(i<=4) {
            /* linear search for the last part */
            if(word0<=toUSection[start]) {
                break;
            }
            if(++start<limit && word0<=toUSection[start]) {
                break;
            }
            if(++start<limit && word0<=toUSection[start]) {
                break;
            }
            /* always break at start==limit-1 */
            ++start;
            break;
        }

        i=(start+limit)/2;
        if(word<toUSection[i]) {
            limit=i;
        } else {
            start=i;
        }
    }

    /* did we really find it? */
    if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) {
        return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */
    } else {
        return 0; /* not found */
    }
}

/*
 * TRUE if not an SI/SO stateful converter,
 * or if the match length fits with the current converter state
 */
#define UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, match) \
    ((sisoState)<0 || ((sisoState)==0) == (match==1))

/*
 * this works like ucnv_extMatchFromU() except
 * - the first character is in pre
 * - no trie is used
 * - the returned matchLength is not offset by 2
 */
static int32_t
ucnv_extMatchToU(const int32_t *cx, int8_t sisoState,
                 const char *pre, int32_t preLength,
                 const char *src, int32_t srcLength,
                 uint32_t *pMatchValue,
                 UBool useFallback, UBool flush) {
    const uint32_t *toUTable, *toUSection;

    uint32_t value, matchValue;
    int32_t i, j, index, length, matchLength;
    uint8_t b;

    if(cx==NULL || cx[UCNV_EXT_TO_U_LENGTH]<=0) {
        return 0; /* no extension data, no match */
    }

    /* initialize */
    toUTable=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t);
    index=0;

    matchValue=0;
    i=j=matchLength=0;

    if(sisoState==0) {
        /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */
        if(preLength>1) {
            return 0; /* no match of a DBCS sequence in SBCS mode */
        } else if(preLength==1) {
            srcLength=0;
        } else /* preLength==0 */ {
            if(srcLength>1) {
                srcLength=1;
            }
        }
        flush=TRUE;
    }

    /* we must not remember fallback matches when not using fallbacks */

    /* match input units until there is a full match or the input is consumed */
    for(;;) {
        /* go to the next section */
        toUSection=toUTable+index;

        /* read first pair of the section */
        value=*toUSection++;
        length=UCNV_EXT_TO_U_GET_BYTE(value);
        value=UCNV_EXT_TO_U_GET_VALUE(value);
        if( value!=0 &&
            (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
             TO_U_USE_FALLBACK(useFallback)) &&
            UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
        ) {
            /* remember longest match so far */
            matchValue=value;
            matchLength=i+j;
        }

        /* match pre[] then src[] */
        if(i<preLength) {
            b=(uint8_t)pre[i++];
        } else if(j<srcLength) {
            b=(uint8_t)src[j++];
        } else {
            /* all input consumed, partial match */
            if(flush || (length=(i+j))>UCNV_EXT_MAX_BYTES) {
                /*
                 * end of the entire input stream, stop with the longest match so far
                 * or: partial match must not be longer than UCNV_EXT_MAX_BYTES
                 * because it must fit into state buffers
                 */
                break;
            } else {
                /* continue with more input next time */
                return -length;
            }
        }

        /* search for the current UChar */
        value=ucnv_extFindToU(toUSection, length, b);
        if(value==0) {
            /* no match here, stop with the longest match so far */
            break;
        } else {
            if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
                /* partial match, continue */
                index=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value);
            } else {
                if( (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
                     TO_U_USE_FALLBACK(useFallback)) &&
                    UCNV_EXT_TO_U_VERIFY_SISO_MATCH(sisoState, i+j)
                ) {
                    /* full match, stop with result */
                    matchValue=value;
                    matchLength=i+j;
                } else {
                    /* full match on fallback not taken, stop with the longest match so far */
                }
                break;
            }
        }
    }

    if(matchLength==0) {
        /* no match at all */
        return 0;
    }

    /* return result */
    *pMatchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue);
    return matchLength;
}

static U_INLINE void
ucnv_extWriteToU(UConverter *cnv, const int32_t *cx,
                 uint32_t value,
                 UChar **target, const UChar *targetLimit,
                 int32_t **offsets, int32_t srcIndex,
                 UErrorCode *pErrorCode) {
    /* output the result */
    if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
        /* output a single code point */
        ucnv_toUWriteCodePoint(
            cnv, UCNV_EXT_TO_U_GET_CODE_POINT(value),
            target, targetLimit,
            offsets, srcIndex,
            pErrorCode);
    } else {
        /* output a string - with correct data we have resultLength>0 */
        ucnv_toUWriteUChars(
            cnv,
            UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+
                UCNV_EXT_TO_U_GET_INDEX(value),
            UCNV_EXT_TO_U_GET_LENGTH(value),
            target, targetLimit,
            offsets, srcIndex,
            pErrorCode);
    }
}

/*
 * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS),
 * or 1 for DBCS-only,
 * or -1 if the converter is not SI/SO stateful
 *
 * Note: For SI/SO stateful converters getting here,
 * cnv->mode==0 is equivalent to firstLength==1.
 */
#define UCNV_SISO_STATE(cnv) \
    ((cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_2_SISO ? (int8_t)(cnv)->mode : \
     (cnv)->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 1 : -1)

/*
 * target<targetLimit; set error code for overflow
 */
U_CFUNC UBool
ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
                        int32_t firstLength,
                        const char **src, const char *srcLimit,
                        UChar **target, const UChar *targetLimit,
                        int32_t **offsets, int32_t srcIndex,
                        UBool flush,
                        UErrorCode *pErrorCode) {
    uint32_t value;
    int32_t match;

    /* try to match */
    match=ucnv_extMatchToU(cx, (int8_t)UCNV_SISO_STATE(cnv),
                           (const char *)cnv->toUBytes, firstLength,
                           *src, (int32_t)(srcLimit-*src),
                           &value,
                           cnv->useFallback, flush);
    if(match>0) {
        /* advance src pointer for the consumed input */
        *src+=match-firstLength;

        /* write result to target */
        ucnv_extWriteToU(cnv, cx,
                         value,
                         target, targetLimit,
                         offsets, srcIndex,
                         pErrorCode);
        return TRUE;
    } else if(match<0) {
        /* save state for partial match */
        const char *s;
        int32_t j;

        /* copy the first code point */
        s=(const char *)cnv->toUBytes;
        cnv->preToUFirstLength=(int8_t)firstLength;
        for(j=0; j<firstLength; ++j) {
            cnv->preToU[j]=*s++;
        }

        /* now copy the newly consumed input */
        s=*src;
        match=-match;
        for(; j<match; ++j) {
            cnv->preToU[j]=*s++;
        }
        *src=s; /* same as *src=srcLimit; because we reached the end of input */
        cnv->preToULength=(int8_t)match;
        return TRUE;
    } else /* match==0 no match */ {
        return FALSE;
    }
}

U_CFUNC UChar32
ucnv_extSimpleMatchToU(const int32_t *cx,
                       const char *source, int32_t length,
                       UBool useFallback) {
    uint32_t value;
    int32_t match;

    if(length<=0) {
        return 0xffff;
    }

    /* try to match */
    match=ucnv_extMatchToU(cx, -1,
                           source, length,
                           NULL, 0,
                           &value,
                           useFallback, TRUE);
    if(match==length) {
        /* write result for simple, single-character conversion */
        if(UCNV_EXT_TO_U_IS_CODE_POINT(value)) {
            return UCNV_EXT_TO_U_GET_CODE_POINT(value);
        }
    }

    /*
     * return no match because
     * - match>0 && value points to string: simple conversion cannot handle multiple code points
     * - match>0 && match!=length: not all input consumed, forbidden for this function
     * - match==0: no match found in the first place
     * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
     */
    return 0xfffe;
}

/*
 * continue partial match with new input
 * never called for simple, single-character conversion
 */
U_CFUNC void
ucnv_extContinueMatchToU(UConverter *cnv,
                         UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
                         UErrorCode *pErrorCode) {
    uint32_t value;
    int32_t match, length;

    match=ucnv_extMatchToU(cnv->sharedData->mbcs.extIndexes, (int8_t)UCNV_SISO_STATE(cnv),
                           cnv->preToU, cnv->preToULength,
                           pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
                           &value,
                           cnv->useFallback, pArgs->flush);
    if(match>0) {
        if(match>=cnv->preToULength) {
            /* advance src pointer for the consumed input */
            pArgs->source+=match-cnv->preToULength;
            cnv->preToULength=0;
        } else {
            /* the match did not use all of preToU[] - keep the rest for replay */
            length=cnv->preToULength-match;
            uprv_memmove(cnv->preToU, cnv->preToU+match, length);
            cnv->preToULength=(int8_t)-length;
        }

        /* write result */
        ucnv_extWriteToU(cnv, cnv->sharedData->mbcs.extIndexes,
                         value,
                         &pArgs->target, pArgs->targetLimit,
                         &pArgs->offsets, srcIndex,
                         pErrorCode);
    } else if(match<0) {
        /* save state for partial match */
        const char *s;
        int32_t j;

        /* just _append_ the newly consumed input to preToU[] */
        s=pArgs->source;
        match=-match;
        for(j=cnv->preToULength; j<match; ++j) {
            cnv->preToU[j]=*s++;
        }
        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
        cnv->preToULength=(int8_t)match;
    } else /* match==0 */ {
        /*
         * no match
         *
         * We need to split the previous input into two parts:
         *
         * 1. The first codepage character is unmappable - that's how we got into
         *    trying the extension data in the first place.
         *    We need to move it from the preToU buffer
         *    to the error buffer, set an error code,
         *    and prepare the rest of the previous input for 2.
         *
         * 2. The rest of the previous input must be converted once we
         *    come back from the callback for the first character.
         *    At that time, we have to try again from scratch to convert
         *    these input characters.
         *    The replay will be handled by the ucnv.c conversion code.
         */

        /* move the first codepage character to the error field */
        uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
        cnv->toULength=cnv->preToUFirstLength;

        /* move the rest up inside the buffer */
        length=cnv->preToULength-cnv->preToUFirstLength;
        if(length>0) {
            uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length);
        }

        /* mark preToU for replay */
        cnv->preToULength=(int8_t)-length;

        /* set the error code for unassigned */
        *pErrorCode=U_INVALID_CHAR_FOUND;
    }
}

/* from Unicode ------------------------------------------------------------- */

/*
 * @return index of the UChar, if found; else <0
 */
static U_INLINE int32_t
ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) {
    int32_t i, start, limit;

    /* binary search */
    start=0;
    limit=length;
    for(;;) {
        i=limit-start;
        if(i<=1) {
            break; /* done */
        }
        /* start<limit-1 */

        if(i<=4) {
            /* linear search for the last part */
            if(u<=fromUSection[start]) {
                break;
            }
            if(++start<limit && u<=fromUSection[start]) {
                break;
            }
            if(++start<limit && u<=fromUSection[start]) {
                break;
            }
            /* always break at start==limit-1 */
            ++start;
            break;
        }

        i=(start+limit)/2;
        if(u<fromUSection[i]) {
            limit=i;
        } else {
            start=i;
        }
    }

    /* did we really find it? */
    if(start<limit && u==fromUSection[start]) {
        return start;
    } else {
        return -1; /* not found */
    }
}

/*
 * @param cx pointer to extension data; if NULL, returns 0
 * @param firstCP the first code point before all the other UChars
 * @param pre UChars that must match; !initialMatch: partial match with them
 * @param preLength length of pre, >=0
 * @param src UChars that can be used to complete a match
 * @param srcLength length of src, >=0
 * @param pMatchValue [out] output result value for the match from the data structure
 * @param useFallback "use fallback" flag, usually from cnv->useFallback
 * @param flush TRUE if the end of the input stream is reached
 * @return >1: matched, return value=total match length (number of input units matched)
 *          1: matched, no mapping but request for <subchar1>
 *             (only for the first code point)
 *          0: no match
 *         <0: partial match, return value=negative total match length
 *             (partial matches are never returned for flush==TRUE)
 *             (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
 *         the matchLength is 2 if only firstCP matched, and >2 if firstCP and
 *         further code units matched
 */
static int32_t
ucnv_extMatchFromU(const int32_t *cx,
                   UChar32 firstCP,
                   const UChar *pre, int32_t preLength,
                   const UChar *src, int32_t srcLength,
                   uint32_t *pMatchValue,
                   UBool useFallback, UBool flush) {
    const uint16_t *stage12, *stage3;
    const uint32_t *stage3b;

    const UChar *fromUTableUChars, *fromUSectionUChars;
    const uint32_t *fromUTableValues, *fromUSectionValues;

    uint32_t value, matchValue;
    int32_t i, j, index, length, matchLength;
    UChar c;

    if(cx==NULL) {
        return 0; /* no extension data, no match */
    }

    /* trie lookup of firstCP */
    index=firstCP>>10; /* stage 1 index */
    if(index>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) {
        return 0; /* the first code point is outside the trie */
    }

    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
    index=UCNV_EXT_FROM_U(stage12, stage3, index, firstCP);

    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
    value=stage3b[index];
    if(value==0) {
        return 0;
    }

    if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
        /* partial match, enter the loop below */
        index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);

        /* initialize */
        fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar);
        fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t);

        matchValue=0;
        i=j=matchLength=0;

        /* we must not remember fallback matches when not using fallbacks */

        /* match input units until there is a full match or the input is consumed */
        for(;;) {
            /* go to the next section */
            fromUSectionUChars=fromUTableUChars+index;
            fromUSectionValues=fromUTableValues+index;

            /* read first pair of the section */
            length=*fromUSectionUChars++;
            value=*fromUSectionValues++;
            if( value!=0 &&
                (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
                 FROM_U_USE_FALLBACK(useFallback, firstCP))
            ) {
                /* remember longest match so far */
                matchValue=value;
                matchLength=2+i+j;
            }

            /* match pre[] then src[] */
            if(i<preLength) {
                c=pre[i++];
            } else if(j<srcLength) {
                c=src[j++];
            } else {
                /* all input consumed, partial match */
                if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) {
                    /*
                     * end of the entire input stream, stop with the longest match so far
                     * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
                     * because it must fit into state buffers
                     */
                    break;
                } else {
                    /* continue with more input next time */
                    return -(2+length);
                }
            }

            /* search for the current UChar */
            index=ucnv_extFindFromU(fromUSectionUChars, length, c);
            if(index<0) {
                /* no match here, stop with the longest match so far */
                break;
            } else {
                value=fromUSectionValues[index];
                if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
                    /* partial match, continue */
                    index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
                } else {
                    if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
                         FROM_U_USE_FALLBACK(useFallback, firstCP)
                    ) {
                        /* full match, stop with result */
                        matchValue=value;
                        matchLength=2+i+j;
                    } else {
                        /* full match on fallback not taken, stop with the longest match so far */
                    }
                    break;
                }
            }
        }

        if(matchLength==0) {
            /* no match at all */
            return 0;
        }
    } else /* result from firstCP trie lookup */ {
        if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
             FROM_U_USE_FALLBACK(useFallback, firstCP)
        ) {
            /* full match, stop with result */
            matchValue=value;
            matchLength=2;
        } else {
            /* fallback not taken */
            return 0;
        }
    }

    if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) {
        /* do not interpret values with reserved bits used, for forward compatibility */
        return 0;
    }

    /* return result */
    if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
        return 1; /* assert matchLength==2 */
    }

    *pMatchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue);
    return matchLength;
}

static U_INLINE void
ucnv_extWriteFromU(UConverter *cnv, const int32_t *cx,
                   uint32_t value,
                   char **target, const char *targetLimit,
                   int32_t **offsets, int32_t srcIndex,
                   UErrorCode *pErrorCode) {
    uint8_t buffer[1+UCNV_EXT_MAX_BYTES];
    const uint8_t *result;
    int32_t length, prevLength;

    length=UCNV_EXT_FROM_U_GET_LENGTH(value);
    value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);

    /* output the result */
    if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
        /*
         * Generate a byte array and then write it below.
         * This is not the fastest possible way, but it should be ok for
         * extension mappings, and it is much simpler.
         * Offset and overflow handling are only done once this way.
         */
        uint8_t *p=buffer+1; /* reserve buffer[0] for shiftByte below */
        switch(length) {
        case 3:
            *p++=(uint8_t)(value>>16);
        case 2:
            *p++=(uint8_t)(value>>8);
        case 1:
            *p++=(uint8_t)value;
        default:
            break; /* will never occur */
        }
        result=buffer+1;
    } else {
        result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
    }

    /* with correct data we have length>0 */

    if((prevLength=cnv->fromUnicodeStatus)!=0) {
        /* handle SI/SO stateful output */
        uint8_t shiftByte;

        if(prevLength>1 && length==1) {
            /* change from double-byte mode to single-byte */
            shiftByte=(uint8_t)UCNV_SI;
            cnv->fromUnicodeStatus=1;
        } else if(prevLength==1 && length>1) {
            /* change from single-byte mode to double-byte */
            shiftByte=(uint8_t)UCNV_SO;
            cnv->fromUnicodeStatus=2;
        } else {
            shiftByte=0;
        }

        if(shiftByte!=0) {
            /* prepend the shift byte to the result bytes */
            buffer[0]=shiftByte;
            if(result!=buffer+1) {
                uprv_memcpy(buffer+1, result, length);
            }
            result=buffer;
            ++length;
        }
    }

    ucnv_fromUWriteBytes(cnv, (const char *)result, length,
                         target, targetLimit,
                         offsets, srcIndex,
                         pErrorCode);
}

/*
 * target<targetLimit; set error code for overflow
 */
U_CFUNC UBool
ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
                          UChar32 cp,
                          const UChar **src, const UChar *srcLimit,
                          char **target, const char *targetLimit,
                          int32_t **offsets, int32_t srcIndex,
                          UBool flush,
                          UErrorCode *pErrorCode) {
    uint32_t value;
    int32_t match;

    /* try to match */
    match=ucnv_extMatchFromU(cx, cp,
                             NULL, 0,
                             *src, (int32_t)(srcLimit-*src),
                             &value,
                             cnv->useFallback, flush);

    /* reject a match if the result is a single byte for DBCS-only */
    if( match>=2 &&
        !(UCNV_EXT_FROM_U_GET_LENGTH(value)==1 &&
          cnv->sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY)
    ) {
        /* advance src pointer for the consumed input */
        *src+=match-2; /* remove 2 for the initial code point */

        /* write result to target */
        ucnv_extWriteFromU(cnv, cx,
                           value,
                           target, targetLimit,
                           offsets, srcIndex,
                           pErrorCode);
        return TRUE;
    } else if(match<0) {
        /* save state for partial match */
        const UChar *s;
        int32_t j;

        /* copy the first code point */
        cnv->preFromUFirstCP=cp;

        /* now copy the newly consumed input */
        s=*src;
        match=-match-2; /* remove 2 for the initial code point */
        for(j=0; j<match; ++j) {
            cnv->preFromU[j]=*s++;
        }
        *src=s; /* same as *src=srcLimit; because we reached the end of input */
        cnv->preFromULength=(int8_t)match;
        return TRUE;
    } else if(match==1) {
        /* matched, no mapping but request for <subchar1> */
        cnv->useSubChar1=TRUE;
        return FALSE;
    } else /* match==0 no match */ {
        return FALSE;
    }
}

U_CFUNC int32_t
ucnv_extSimpleMatchFromU(const int32_t *cx,
                         UChar32 cp, uint32_t *pValue,
                         UBool useFallback) {
    uint32_t value;
    int32_t match;

    /* try to match */
    match=ucnv_extMatchFromU(cx,
                             cp,
                             NULL, 0,
                             NULL, 0,
                             &value,
                             useFallback, TRUE);
    if(match>=2) {
        /* write result for simple, single-character conversion */
        int32_t length;
        
        length=UCNV_EXT_FROM_U_GET_LENGTH(value);
        value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(value);

        if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
            *pValue=value;
            return length;
#if 0 /* not currently used */
        } else if(length==4) {
            /* de-serialize a 4-byte result */
            const uint8_t *result=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+value;
            *pValue=
                ((uint32_t)result[0]<<24)|
                ((uint32_t)result[1]<<16)|
                ((uint32_t)result[2]<<8)|
                result[3];
            return 4;
#endif
        }
    }

    /*
     * return no match because
     * - match>1 && resultLength>4: result too long for simple conversion
     * - match==1: no match found, <subchar1> preferred
     * - match==0: no match found in the first place
     * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
     */
    return 0;
}

/*
 * continue partial match with new input, requires cnv->preFromUFirstCP>=0
 * never called for simple, single-character conversion
 */
U_CFUNC void
ucnv_extContinueMatchFromU(UConverter *cnv,
                           UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
                           UErrorCode *pErrorCode) {
    uint32_t value;
    int32_t match;

    match=ucnv_extMatchFromU(cnv->sharedData->mbcs.extIndexes,
                             cnv->preFromUFirstCP,
                             cnv->preFromU, cnv->preFromULength,
                             pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
                             &value,
                             cnv->useFallback, pArgs->flush);
    if(match>=2) {
        match-=2; /* remove 2 for the initial code point */

        if(match>=cnv->preFromULength) {
            /* advance src pointer for the consumed input */
            pArgs->source+=match-cnv->preFromULength;
            cnv->preFromULength=0;
        } else {
            /* the match did not use all of preFromU[] - keep the rest for replay */
            int32_t length=cnv->preFromULength-match;
            uprv_memmove(cnv->preFromU, cnv->preFromU+match, length*U_SIZEOF_UCHAR);
            cnv->preFromULength=(int8_t)-length;
        }

        /* finish the partial match */
        cnv->preFromUFirstCP=U_SENTINEL;

        /* write result */
        ucnv_extWriteFromU(cnv, cnv->sharedData->mbcs.extIndexes,
                           value,
                           &pArgs->target, pArgs->targetLimit,
                           &pArgs->offsets, srcIndex,
                           pErrorCode);
    } else if(match<0) {
        /* save state for partial match */
        const UChar *s;
        int32_t j;

        /* just _append_ the newly consumed input to preFromU[] */
        s=pArgs->source;
        match=-match-2; /* remove 2 for the initial code point */
        for(j=cnv->preFromULength; j<match; ++j) {
            cnv->preFromU[j]=*s++;
        }
        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
        cnv->preFromULength=(int8_t)match;
    } else /* match==0 or 1 */ {
        /*
         * no match
         *
         * We need to split the previous input into two parts:
         *
         * 1. The first code point is unmappable - that's how we got into
         *    trying the extension data in the first place.
         *    We need to move it from the preFromU buffer
         *    to the error buffer, set an error code,
         *    and prepare the rest of the previous input for 2.
         *
         * 2. The rest of the previous input must be converted once we
         *    come back from the callback for the first code point.
         *    At that time, we have to try again from scratch to convert
         *    these input characters.
         *    The replay will be handled by the ucnv.c conversion code.
         */

        if(match==1) {
            /* matched, no mapping but request for <subchar1> */
            cnv->useSubChar1=TRUE;
        }

        /* move the first code point to the error field */
        cnv->fromUChar32=cnv->preFromUFirstCP;
        cnv->preFromUFirstCP=U_SENTINEL;

        /* mark preFromU for replay */
        cnv->preFromULength=-cnv->preFromULength;

        /* set the error code for unassigned */
        *pErrorCode=U_INVALID_CHAR_FOUND;
    }
}

static void
ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
                            const int32_t *cx,
                            USetAdder *sa,
                            UConverterUnicodeSet which,
                            int32_t minLength,
                            UChar32 c,
                            UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
                            int32_t sectionIndex,
                            UErrorCode *pErrorCode) {
    const UChar *fromUSectionUChars;
    const uint32_t *fromUSectionValues;

    uint32_t value;
    int32_t i, count;

    fromUSectionUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar)+sectionIndex;
    fromUSectionValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t)+sectionIndex;

    /* read first pair of the section */
    count=*fromUSectionUChars++;
    value=*fromUSectionValues++;

    if( value!=0 &&
        UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) &&
        UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
    ) {
        if(c>=0) {
            /* add the initial code point */
            sa->add(sa->set, c);
        } else {
            /* add the string so far */
            sa->addString(sa->set, s, length);
        }
    }

    for(i=0; i<count; ++i) {
        /* append this code unit and recurse or add the string */
        s[length]=fromUSectionUChars[i];
        value=fromUSectionValues[i];

        if(value==0) {
            /* no mapping, do nothing */
        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
            ucnv_extGetUnicodeSetString(
                sharedData, cx, sa, which, minLength,
                U_SENTINEL, s, length+1,
                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
                pErrorCode);
        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
                           UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
                  UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
        ) {
            sa->addString(sa->set, s, length+1);
        }
    }
}

U_CFUNC void
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
                      USetAdder *sa,
                      UConverterUnicodeSet which,
                      UErrorCode *pErrorCode) {
    const int32_t *cx;
    const uint16_t *stage12, *stage3, *ps2, *ps3;
    const uint32_t *stage3b;

    uint32_t value;
    int32_t st1, stage1Length, st2, st3, minLength;

    UChar s[UCNV_EXT_MAX_UCHARS];
    UChar32 c;
    int32_t length;

    cx=sharedData->mbcs.extIndexes;
    if(cx==NULL) {
        return;
    }

    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);

    stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];

    /* enumerate the from-Unicode trie table */
    c=0; /* keep track of the current code point while enumerating */

    if(sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY) {
        /* DBCS-only, ignore single-byte results */
        minLength=2;
    } else {
        minLength=1;
    }

    /*
     * the trie enumeration is almost the same as
     * in MBCSGetUnicodeSet() for MBCS_OUTPUT_1
     */
    for(st1=0; st1<stage1Length; ++st1) {
        st2=stage12[st1];
        if(st2>stage1Length) {
            ps2=stage12+st2;
            for(st2=0; st2<64; ++st2) {
                if((st3=(int32_t)ps2[st2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)!=0) {
                    /* read the stage 3 block */
                    ps3=stage3+st3;

                    /*
                     * Add code points for which the roundtrip flag is set.
                     * Do not add <subchar1> entries or other (future?) pseudo-entries
                     * with an output length of 0, or entries with reserved bits set.
                     * Recurse for partial results.
                     */
                    do {
                        value=stage3b[*ps3++];
                        if(value==0) {
                            /* no mapping, do nothing */
                        } else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
                            length=0;
                            U16_APPEND_UNSAFE(s, length, c);
                            ucnv_extGetUnicodeSetString(
                                sharedData, cx, sa, which, minLength,
                                c, s, length,
                                (int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
                                pErrorCode);
                        } else if(((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
                                           UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) &&
                                  UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
                        ) {
                            sa->add(sa->set, c);
                        }
                    } while((++c&0xf)!=0);
                } else {
                    c+=16; /* empty stage 3 block */
                }
            }
        } else {
            c+=1024; /* empty stage 2 block */
        }
    }
}

#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */