csdetect.cpp   [plain text]


/*
 **********************************************************************
 *   Copyright (C) 2005-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "unicode/ucsdet.h"

#include "csdetect.h"
#include "csmatch.h"
#include "uenumimp.h"

#include "cmemory.h"
#include "cstring.h"
#include "umutex.h"
#include "ucln_in.h"
#include "uarrsort.h"
#include "inputext.h"
#include "csrsbcs.h"
#include "csrmbcs.h"
#include "csrutf8.h"
#include "csrucode.h"
#include "csr2022.h"

#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))

U_NAMESPACE_BEGIN

struct CSRecognizerInfo : public UMemory {
    CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
        : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};

    ~CSRecognizerInfo() {delete recognizer;};

    CharsetRecognizer *recognizer;
    UBool isDefaultEnabled;
};

U_NAMESPACE_END

static icu::CSRecognizerInfo **fCSRecognizers = NULL;
static icu::UInitOnce gCSRecognizersInitOnce;
static int32_t fCSRecognizers_size = 0;

U_CDECL_BEGIN
static UBool U_CALLCONV csdet_cleanup(void)
{
    U_NAMESPACE_USE
    if (fCSRecognizers != NULL) {
        for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
            delete fCSRecognizers[r];
            fCSRecognizers[r] = NULL;
        }

        DELETE_ARRAY(fCSRecognizers);
        fCSRecognizers = NULL;
        fCSRecognizers_size = 0;
    }
    gCSRecognizersInitOnce.reset();

    return TRUE;
}

static int32_t U_CALLCONV
charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
{
    U_NAMESPACE_USE

    const CharsetMatch **csm_l = (const CharsetMatch **) left;
    const CharsetMatch **csm_r = (const CharsetMatch **) right;

    // NOTE: compare is backwards to sort from highest to lowest.
    return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
}

static void U_CALLCONV initRecognizers(UErrorCode &status) {
    U_NAMESPACE_USE
    ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
    CSRecognizerInfo *tempArray[] = {
        new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),

        new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),

        new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),

        new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
#if !UCONFIG_ONLY_HTML_CONVERSION
        new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
        new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),

        new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
        new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
        new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
#endif
    };
    int32_t rCount = UPRV_LENGTHOF(tempArray);

    fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);

    if (fCSRecognizers == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    } 
    else {
        fCSRecognizers_size = rCount;
        for (int32_t r = 0; r < rCount; r += 1) {
            fCSRecognizers[r] = tempArray[r];
            if (fCSRecognizers[r] == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
            }
        }
    }
}

U_CDECL_END

U_NAMESPACE_BEGIN

void CharsetDetector::setRecognizers(UErrorCode &status)
{
    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
}

CharsetDetector::CharsetDetector(UErrorCode &status)
  : textIn(new InputText(status)), resultArray(NULL),
    resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
    fEnabledRecognizers(NULL)
{
    if (U_FAILURE(status)) {
        return;
    }

    setRecognizers(status);

    if (U_FAILURE(status)) {
        return;
    }

    resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);

    if (resultArray == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
        resultArray[i] = new CharsetMatch();

        if (resultArray[i] == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            break;
        }
    }
}

CharsetDetector::~CharsetDetector()
{
    delete textIn;

    for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
        delete resultArray[i];
    }

    uprv_free(resultArray);

    if (fEnabledRecognizers) {
        uprv_free(fEnabledRecognizers);
    }
}

void CharsetDetector::setText(const char *in, int32_t len)
{
    textIn->setText(in, len);
    fFreshTextSet = TRUE;
}

UBool CharsetDetector::setStripTagsFlag(UBool flag)
{
    UBool temp = fStripTags;
    fStripTags = flag;
    fFreshTextSet = TRUE;
    return temp;
}

UBool CharsetDetector::getStripTagsFlag() const
{
    return fStripTags;
}

void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
{
    textIn->setDeclaredEncoding(encoding,len);
}

int32_t CharsetDetector::getDetectableCount()
{
    UErrorCode status = U_ZERO_ERROR;

    setRecognizers(status);

    return fCSRecognizers_size; 
}

const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
{
    int32_t maxMatchesFound = 0;

    detectAll(maxMatchesFound, status);

    if(maxMatchesFound > 0) {
        return resultArray[0];
    } else {
        return NULL;
    }
}

const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
{
    if(!textIn->isSet()) {
        status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set

        return NULL;
    } else if (fFreshTextSet) {
        CharsetRecognizer *csr;
        int32_t            i;

        textIn->MungeInput(fStripTags);

        // Iterate over all possible charsets, remember all that
        // give a match quality > 0.
        resultCount = 0;
        for (i = 0; i < fCSRecognizers_size; i += 1) {
            csr = fCSRecognizers[i]->recognizer;
            if (csr->match(textIn, resultArray[resultCount])) {
                resultCount++;
            }
        }

        if (resultCount > 1) {
            uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
        }
        fFreshTextSet = FALSE;
    }

    maxMatchesFound = resultCount;

    return resultArray;
}

void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return;
    }

    int32_t modIdx = -1;
    UBool isDefaultVal = FALSE;
    for (int32_t i = 0; i < fCSRecognizers_size; i++) {
        CSRecognizerInfo *csrinfo = fCSRecognizers[i];
        if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
            modIdx = i;
            isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
            break;
        }
    }
    if (modIdx < 0) {
        // No matching encoding found
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    if (fEnabledRecognizers == NULL && !isDefaultVal) {
        // Create an array storing the non default setting
        fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
        if (fEnabledRecognizers == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        // Initialize the array with default info
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
            fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
        }
    }

    if (fEnabledRecognizers != NULL) {
        fEnabledRecognizers[modIdx] = enabled;
    }
}

/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
{
    if( index > fCSRecognizers_size-1 || index < 0) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;

        return 0;
    } else {
        return fCSRecognizers[index]->getName();
    }
}*/

U_NAMESPACE_END

U_CDECL_BEGIN
typedef struct {
    int32_t currIndex;
    UBool all;
    UBool *enabledRecognizers;
} Context;



static void U_CALLCONV
enumClose(UEnumeration *en) {
    if(en->context != NULL) {
        DELETE_ARRAY(en->context);
    }

    DELETE_ARRAY(en);
}

static int32_t U_CALLCONV
enumCount(UEnumeration *en, UErrorCode *) {
    if (((Context *)en->context)->all) {
        // ucsdet_getAllDetectableCharsets, all charset detector names
        return fCSRecognizers_size;
    }

    // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
    int32_t count = 0;
    UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
    if (enabledArray != NULL) {
        // custom set
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
            if (enabledArray[i]) {
                count++;
            }
        }
    } else {
        // default set
        for (int32_t i = 0; i < fCSRecognizers_size; i++) {
            if (fCSRecognizers[i]->isDefaultEnabled) {
                count++;
            }
        }
    }
    return count;
}

static const char* U_CALLCONV
enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
    const char *currName = NULL;

    if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
        if (((Context *)en->context)->all) {
            // ucsdet_getAllDetectableCharsets, all charset detector names
            currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
            ((Context *)en->context)->currIndex++;
        } else {
            // ucsdet_getDetectableCharsets
            UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
            if (enabledArray != NULL) {
                // custome set
                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
                    if (enabledArray[((Context *)en->context)->currIndex]) {
                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
                    }
                    ((Context *)en->context)->currIndex++;
                }
            } else {
                // default set
                while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
                    if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
                        currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
                    }
                    ((Context *)en->context)->currIndex++;
                }
            }
        }
    }

    if(resultLength != NULL) {
        *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
    }

    return currName;
}


static void U_CALLCONV
enumReset(UEnumeration *en, UErrorCode *) {
    ((Context *)en->context)->currIndex = 0;
}

static const UEnumeration gCSDetEnumeration = {
    NULL,
    NULL,
    enumClose,
    enumCount,
    uenum_unextDefault,
    enumNext,
    enumReset
};

U_CDECL_END

U_NAMESPACE_BEGIN

UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
{

    /* Initialize recognized charsets. */
    setRecognizers(status);

    if(U_FAILURE(status)) {
        return 0;
    }

    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
    if (en == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
    en->context = (void*)NEW_ARRAY(Context, 1);
    if (en->context == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        DELETE_ARRAY(en);
        return 0;
    }
    uprv_memset(en->context, 0, sizeof(Context));
    ((Context*)en->context)->all = TRUE;
    return en;
}

UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
{
    if(U_FAILURE(status)) {
        return 0;
    }

    UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
    if (en == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
    en->context = (void*)NEW_ARRAY(Context, 1);
    if (en->context == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        DELETE_ARRAY(en);
        return 0;
    }
    uprv_memset(en->context, 0, sizeof(Context));
    ((Context*)en->context)->all = FALSE;
    ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
    return en;
}

U_NAMESPACE_END

#endif