// csrsbcs.h


/*
 **********************************************************************
 *   Copyright (C) 2005-2009, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#ifndef __CSRSBCS_H
#define __CSRSBCS_H

#include "unicode/uobject.h"

#if !UCONFIG_NO_CONVERSION

#include "csrecog.h"

U_NAMESPACE_BEGIN

/*
 * N-gram scanning helper. It holds pointers to an n-gram list and a
 * character map (presumably the ones passed to the constructor; confirm in
 * the .cpp), plus a byte index, the current n-gram value and two counters.
 * Every member function is only declared here.
 */
class NGramParser : public UMemory {
public:
    NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);

    /* The only operation exposed to callers besides construction. */
    int32_t parse(InputText *det);

private:
    /*
     * Binary search for value in table; the table is required to hold
     * exactly 64 entries.
     */
    int32_t search(const int32_t *table, int32_t value);

    void lookup(int32_t thisNgram);
    void addByte(int32_t b);
    int32_t nextByte(InputText *det);

    /* Data members; their declaration order fixes the initialization order. */
    int32_t byteIndex;
    int32_t ngram;

    const int32_t *ngramList;
    const uint8_t *charMap;

    int32_t ngramCount;
    int32_t hitCount;
};

/*
 * Abstract base for the recognizers declared in this header. getName() and
 * match() are pure virtual, so subclasses must provide both; match_sbcs()
 * is a non-virtual helper taking an n-gram table and a character map.
 */
class CharsetRecog_sbcs : public CharsetRecognizer {
public:
    CharsetRecog_sbcs();

    virtual ~CharsetRecog_sbcs();

    /* Pure virtual: supplied by the charset-level subclasses. */
    virtual const char *getName() const = 0;

    /* Pure virtual: supplied by the subclasses that declare match(). */
    virtual int32_t match(InputText *det) = 0;

    int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]);

protected:
    /* Flag visible to subclasses; it is set outside this header. */
    UBool haveC1Bytes;
};

/*
 * Charset-level layer over CharsetRecog_sbcs (ISO-8859-1, judging by the
 * class name). It declares getName() only, so match() remains pure virtual
 * at this level.
 */
class CharsetRecog_8859_1 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_1();

    virtual const char *getName() const;
};

/*
 * Charset-level layer over CharsetRecog_sbcs (ISO-8859-2, judging by the
 * class name). It declares getName() only, so match() remains pure virtual
 * at this level.
 */
class CharsetRecog_8859_2 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_2();

    virtual const char *getName() const;
};

/*
 * Charset-level layer over CharsetRecog_sbcs (ISO-8859-5, judging by the
 * class name). It declares getName() only, so match() remains pure virtual
 * at this level.
 */
class CharsetRecog_8859_5 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_5();

    virtual const char *getName() const;
};

/*
 * Charset-level layer over CharsetRecog_sbcs (ISO-8859-6, judging by the
 * class name). It declares getName() only, so match() remains pure virtual
 * at this level.
 */
class CharsetRecog_8859_6 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_6();

    virtual const char *getName() const;
};

/*
 * Charset-level layer over CharsetRecog_sbcs (ISO-8859-7, judging by the
 * class name). It declares getName() only, so match() remains pure virtual
 * at this level.
 */
class CharsetRecog_8859_7 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_7();

    virtual const char *getName() const;
};

/*
 * Charset-level layer over CharsetRecog_sbcs (ISO-8859-8, judging by the
 * class name). It declares getName() only, so match() remains pure virtual
 * at this level.
 */
class CharsetRecog_8859_8 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_8();

    virtual const char *getName() const;
};

/*
 * Charset-level layer over CharsetRecog_sbcs (ISO-8859-9, judging by the
 * class name). It declares getName() only, so match() remains pure virtual
 * at this level.
 */
class CharsetRecog_8859_9 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_8859_9();

    virtual const char *getName() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_1; the "_en" suffix presumably
 * means English (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_1_en : public CharsetRecog_8859_1 {
public:
    virtual ~CharsetRecog_8859_1_en();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_1; the "_da" suffix presumably
 * means Danish (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_1_da : public CharsetRecog_8859_1 {
public:
    virtual ~CharsetRecog_8859_1_da();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_1; the "_de" suffix presumably
 * means German (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_1_de : public CharsetRecog_8859_1 {
public:
    virtual ~CharsetRecog_8859_1_de();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_1; the "_es" suffix presumably
 * means Spanish (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_1_es : public CharsetRecog_8859_1 {
public:
    virtual ~CharsetRecog_8859_1_es();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_1; the "_fr" suffix presumably
 * means French (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_1_fr : public CharsetRecog_8859_1 {
public:
    virtual ~CharsetRecog_8859_1_fr();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_1; the "_it" suffix presumably
 * means Italian (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_1_it : public CharsetRecog_8859_1 {
public:
    virtual ~CharsetRecog_8859_1_it();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_1; the "_nl" suffix presumably
 * means Dutch (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_1_nl : public CharsetRecog_8859_1 {
public:
    virtual ~CharsetRecog_8859_1_nl();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_1; the "_no" suffix presumably
 * means Norwegian (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_1_no : public CharsetRecog_8859_1 {
public:
    virtual ~CharsetRecog_8859_1_no();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_1; the "_pt" suffix presumably
 * means Portuguese (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_1_pt : public CharsetRecog_8859_1 {
public:
    virtual ~CharsetRecog_8859_1_pt();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_1; the "_sv" suffix presumably
 * means Swedish (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_1_sv : public CharsetRecog_8859_1 {
public:
    virtual ~CharsetRecog_8859_1_sv();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_2; the "_cs" suffix presumably
 * means Czech (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_2_cs : public CharsetRecog_8859_2 {
public:
    virtual ~CharsetRecog_8859_2_cs();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_2; the "_hu" suffix presumably
 * means Hungarian (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_2_hu : public CharsetRecog_8859_2 {
public:
    virtual ~CharsetRecog_8859_2_hu();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_2; the "_pl" suffix presumably
 * means Polish (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_2_pl : public CharsetRecog_8859_2 {
public:
    virtual ~CharsetRecog_8859_2_pl();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_2; the "_ro" suffix presumably
 * means Romanian (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_2_ro : public CharsetRecog_8859_2 {
public:
    virtual ~CharsetRecog_8859_2_ro();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_5; the "_ru" suffix presumably
 * means Russian (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5 {
public:
    virtual ~CharsetRecog_8859_5_ru();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_6; the "_ar" suffix presumably
 * means Arabic (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6 {
public:
    virtual ~CharsetRecog_8859_6_ar();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_7; the "_el" suffix presumably
 * means Greek (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_7_el : public CharsetRecog_8859_7 {
public:
    virtual ~CharsetRecog_8859_7_el();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_8 (the "_I_he" suffix presumably
 * marks the ISO-8859-8-I / Hebrew variant; confirm in the .cpp). Unlike its
 * "_he" sibling it declares getName() again, overriding the one from
 * CharsetRecog_8859_8, in addition to getLanguage() and match().
 */
class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8 {
public:
    virtual ~CharsetRecog_8859_8_I_he();

    int32_t match(InputText *textIn);

    const char *getName() const;

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_8; the "_he" suffix presumably
 * means Hebrew (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_8_he : public CharsetRecog_8859_8 {
public:
    virtual ~CharsetRecog_8859_8_he();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer subclass of CharsetRecog_8859_9; the "_tr" suffix presumably
 * means Turkish (name-based, confirm in the .cpp). Its match() overrides
 * the pure virtual declared in CharsetRecog_sbcs.
 */
class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9 {
public:
    virtual ~CharsetRecog_8859_9_tr();

    int32_t match(InputText *textIn);

    const char *getLanguage() const;
};

/*
 * Recognizer derived directly from CharsetRecog_sbcs (windows-1256, judging
 * by the class name). It declares both functions that the base leaves pure
 * virtual, getName() and match(), together with getLanguage().
 */
class CharsetRecog_windows_1256 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_windows_1256();

    const char *getLanguage() const;

    const char *getName() const;

    int32_t match(InputText *textIn);
};

/*
 * Recognizer derived directly from CharsetRecog_sbcs (windows-1251, judging
 * by the class name). It declares both functions that the base leaves pure
 * virtual, getName() and match(), together with getLanguage().
 */
class CharsetRecog_windows_1251 : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_windows_1251();

    const char *getLanguage() const;

    const char *getName() const;

    int32_t match(InputText *textIn);
};


/*
 * Recognizer derived directly from CharsetRecog_sbcs (KOI8-R, judging by
 * the class name). It declares both functions that the base leaves pure
 * virtual, getName() and match(), together with getLanguage().
 */
class CharsetRecog_KOI8_R : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_KOI8_R();

    const char *getLanguage() const;

    const char *getName() const;

    int32_t match(InputText *textIn);
};

/*
 * Shared parent of the IBM424 "_rtl" and "_ltr" recognizers. It declares
 * getLanguage() only; getName() and match() from CharsetRecog_sbcs stay
 * pure virtual here and are declared by the two subclasses.
 */
class CharsetRecog_IBM424_he : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_IBM424_he();

    const char *getLanguage() const;
};

/*
 * IBM424 recognizer variant; "_rtl" presumably stands for right-to-left
 * text order (name-based, confirm in the .cpp). It declares the getName()
 * and match() that CharsetRecog_IBM424_he leaves undeclared.
 */
class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he
{
public:
    virtual ~CharsetRecog_IBM424_he_rtl();

    int32_t match(InputText *textIn);

    const char *getName() const;
};

/*
 * IBM424 recognizer variant; "_ltr" presumably stands for left-to-right
 * text order (name-based, confirm in the .cpp). It declares the getName()
 * and match() that CharsetRecog_IBM424_he leaves undeclared.
 *
 * The members are explicitly public, matching CharsetRecog_IBM424_he_rtl.
 * Without the access specifier they default to private for a `class`,
 * which makes the destructor and both overrides inaccessible whenever the
 * object is used through its own static type.
 */
class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he
{
public:
    virtual ~CharsetRecog_IBM424_he_ltr();

    const char *getName() const;

    int32_t match(InputText *textIn);
};

/*
 * Shared parent of the IBM420 "_rtl" and "_ltr" recognizers. It declares
 * getLanguage() plus the protected matchInit()/matchFinish() pair for
 * subclasses; getName() and match() from CharsetRecog_sbcs stay pure
 * virtual at this level.
 */
class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs {
public:
    virtual ~CharsetRecog_IBM420_ar();

    const char *getLanguage() const;

protected:
    /* Presumably called around a subclass's match(); confirm in the .cpp. */
    void matchInit(InputText *textIn);
    void matchFinish(InputText *textIn);

private:
    /* Helpers; each unshape function reports a length through `length`. */
    UBool isLamAlef(uint8_t b);
    uint8_t *unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);
    uint8_t *unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length);

    /*
     * State; by the names, a saved input buffer with its length and a flag
     * saying whether a buffer must be freed. Verify against
     * matchInit()/matchFinish().
     */
    uint8_t *prev_fInputBytes;
    int32_t prev_fInputBytesLength;
    UBool deleteBuffer;
};

/*
 * IBM420 recognizer variant; "_rtl" presumably stands for right-to-left
 * text order (name-based, confirm in the .cpp). It declares the getName()
 * and match() that CharsetRecog_IBM420_ar leaves undeclared.
 */
class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar
{
public:
    virtual ~CharsetRecog_IBM420_ar_rtl();

    int32_t match(InputText *textIn);

    const char *getName() const;
};

/*
 * IBM420 recognizer variant; "_ltr" presumably stands for left-to-right
 * text order (name-based, confirm in the .cpp). It declares the getName()
 * and match() that CharsetRecog_IBM420_ar leaves undeclared.
 *
 * The members are explicitly public, matching CharsetRecog_IBM420_ar_rtl.
 * Without the access specifier they default to private for a `class`,
 * which makes the destructor and both overrides inaccessible whenever the
 * object is used through its own static type.
 */
class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar
{
public:
    virtual ~CharsetRecog_IBM420_ar_ltr();

    const char *getName() const;

    int32_t match(InputText *textIn);
};

U_NAMESPACE_END

#endif
#endif /* __CSRSBCS_H */