// dictbe.h   [plain text]


/**
 *******************************************************************************
 * Copyright (C) 2006,2011, International Business Machines Corporation        *
 * and others. All Rights Reserved.                                            *
 *******************************************************************************
 */

#ifndef DICTBE_H
#define DICTBE_H

#include "unicode/utypes.h"
#include "unicode/uniset.h"
#include "unicode/utext.h"

#include "brkeng.h"

U_NAMESPACE_BEGIN

class TrieWordDictionary;

/*******************************************************************
 * DictionaryBreakEngine
 */

/**
 * <p>A LanguageBreakEngine that consults a dictionary in order to locate
 * language-specific breaks.</p>
 *
 * <p>Once construction has completed, one DictionaryBreakEngine may be used
 * from several threads without synchronization.</p>
 */
class DictionaryBreakEngine : public LanguageBreakEngine {
 private:

  /**
   * Characters this engine handles.
   * @internal
   */
  UnicodeSet    fSet;

  /**
   * Break types this engine handles (a bitmap, as passed to the
   * constructor).
   * @internal
   */
  uint32_t      fTypes;

  /**
   * <p>Default constructor. Declared in the private section, so it is not
   * accessible to clients or to subclasses.</p>
   */
  DictionaryBreakEngine();

 public:

  /**
   * <p>Construct an engine for a given collection of break types.</p>
   *
   * @param breakTypes Bitmap of the break types the engine handles.
   */
  DictionaryBreakEngine( uint32_t breakTypes );

  /**
   * <p>Destructor; virtual so that subclasses are destroyed correctly
   * through a base pointer.</p>
   */
  virtual ~DictionaryBreakEngine();

  /**
   * <p>Report whether this engine handles a given character for a given
   * kind of break.</p>
   *
   * @param c The character starting a run the engine might handle.
   * @param breakType The kind of text break the caller wants to determine.
   * @return TRUE when both the character and the break type are handled by
   * this engine.
   */
  virtual UBool handles( UChar32 c, int32_t breakType ) const;

  /**
   * <p>Locate the breaks inside one run of the supplied text.</p>
   *
   * @param text UText holding the text. Its iterator is left at the end of
   * the run of characters this engine is capable of handling.
   * @param startPos Where the run starts within the text.
   * @param endPos Where the run ends within the text.
   * @param reverse Whether the caller is searching for breaks in the
   * reverse direction.
   * @param breakType The kind of break wanted, or -1.
   * @param foundBreaks UStack through which the breaks found, if any, are
   * reported.
   * @return How many breaks were found.
   */
  virtual int32_t findBreaks( UText *text, int32_t startPos, int32_t endPos,
                              UBool reverse, int32_t breakType,
                              UStack &foundBreaks ) const;

 protected:

  /**
   * <p>Define which characters this engine handles.</p>
   *
   * @param set UnicodeSet holding the characters handled by the engine.
   */
  virtual void setCharacters( const UnicodeSet &set );

  // A setter for the break types is currently disabled (kept below,
  // commented out); the break types are supplied through the constructor.
//  virtual void setBreakTypes( uint32_t breakTypes );

  /**
   * <p>Split a range of known dictionary characters into pieces. Pure
   * virtual: each concrete engine supplies its own implementation.</p>
   *
   * @param text UText holding the text.
   * @param rangeStart Where the range of dictionary characters starts.
   * @param rangeEnd Where the range of dictionary characters ends.
   * @param foundBreaks UStack through which the break positions are
   * reported.
   * @return How many breaks were found.
   */
  virtual int32_t divideUpDictionaryRange( UText *text, int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const = 0;

};

/*******************************************************************
 * ThaiBreakEngine
 */

/**
 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
 * TrieWordDictionary and heuristics to determine Thai-specific breaks.</p>
 *
 * <p>After it is constructed a ThaiBreakEngine may be shared between
 * threads without synchronization.</p>
 *
 * <p>The engine adopts its dictionary, so it is deliberately not copyable:
 * a memberwise copy would leave two engines owning the same dictionary.</p>
 */
class ThaiBreakEngine : public DictionaryBreakEngine {
 private:

  /**
   * Character sets used while dividing up Thai text. Their contents are
   * established by the implementation and are not visible in this header.
   * @internal
   */
  UnicodeSet                fThaiWordSet;
  UnicodeSet                fEndWordSet;
  UnicodeSet                fBeginWordSet;
  UnicodeSet                fSuffixSet;
  UnicodeSet                fMarkSet;

  /**
   * The dictionary adopted at construction time.
   * @internal
   */
  const TrieWordDictionary  *fDictionary;

 public:

  /**
   * <p>Constructor adopting the dictionary the engine works from.</p>
   *
   * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
   * engine is deleted.
   * @param status ICU error code; presumably set on failure -- confirm
   * against the implementation.
   */
  ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);

  /**
   * <p>Virtual destructor.</p>
   */
  virtual ~ThaiBreakEngine();

 protected:

  /**
   * <p>Divide up a range of known dictionary characters.</p>
   *
   * @param text A UText representing the text
   * @param rangeStart The start of the range of dictionary characters
   * @param rangeEnd The end of the range of dictionary characters
   * @param foundBreaks UStack through which the break positions are reported
   * @return The number of breaks found
   */
  virtual int32_t divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const;

 private:

  /**
   * Copying is disabled because the engine owns fDictionary. Both members
   * are declared but intentionally left unimplemented.
   */
  ThaiBreakEngine(const ThaiBreakEngine &other);
  ThaiBreakEngine &operator=(const ThaiBreakEngine &other);

};


/******************************************************************* 
 * KhmerBreakEngine 
 */ 
 
/**
 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
 * TrieWordDictionary and heuristics to determine Khmer-specific breaks.</p>
 *
 * <p>After it is constructed a KhmerBreakEngine may be shared between
 * threads without synchronization.</p>
 *
 * <p>The engine adopts its dictionary, so it is deliberately not copyable:
 * a memberwise copy would leave two engines owning the same dictionary.</p>
 */
class KhmerBreakEngine : public DictionaryBreakEngine {
 private:

  /**
   * Character sets used while dividing up Khmer text. Their contents are
   * established by the implementation and are not visible in this header.
   * @internal
   */
  UnicodeSet                fKhmerWordSet;
  UnicodeSet                fEndWordSet;
  UnicodeSet                fBeginWordSet;
  UnicodeSet                fMarkSet;

  /**
   * The dictionary adopted at construction time.
   * @internal
   */
  const TrieWordDictionary  *fDictionary;

 public:

  /**
   * <p>Constructor adopting the dictionary the engine works from.</p>
   *
   * @param adoptDictionary A TrieWordDictionary to adopt. Deleted when the
   * engine is deleted.
   * @param status ICU error code; presumably set on failure -- confirm
   * against the implementation.
   */
  KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status);

  /**
   * <p>Virtual destructor.</p>
   */
  virtual ~KhmerBreakEngine();

 protected:

  /**
   * <p>Divide up a range of known dictionary characters.</p>
   *
   * @param text A UText representing the text
   * @param rangeStart The start of the range of dictionary characters
   * @param rangeEnd The end of the range of dictionary characters
   * @param foundBreaks UStack through which the break positions are reported
   * @return The number of breaks found
   */
  virtual int32_t divideUpDictionaryRange( UText *text,
                                           int32_t rangeStart,
                                           int32_t rangeEnd,
                                           UStack &foundBreaks ) const;

 private:

  /**
   * Copying is disabled because the engine owns fDictionary. Both members
   * are declared but intentionally left unimplemented.
   */
  KhmerBreakEngine(const KhmerBreakEngine &other);
  KhmerBreakEngine &operator=(const KhmerBreakEngine &other);

};
 
 
U_NAMESPACE_END

    /* DICTBE_H */
#endif