/* rbtok.h */


/*
***************************************************************************
* Copyright (C) 2006-2008 Apple Inc. All Rights Reserved.                 *
***************************************************************************
*/

#ifndef RBTOK_H
#define RBTOK_H

#include "unicode/utypes.h"

/**
 * \file
 * \brief C++ API: Rule Based Tokenizer
 */

#if !UCONFIG_NO_BREAK_ITERATION

#include "unicode/urbtok.h"
#include "unicode/rbbi.h"
#include "unicode/parseerr.h"


U_NAMESPACE_BEGIN

/**
 * Forward declarations of RBBI data structures that are defined elsewhere.
 * Within this header RBBIStateTableRow is used only as a pointer target
 * (RuleBasedTokenizer::fStartRow), so an incomplete type is sufficient.
 * RBBIDataHeader is not referenced by any declaration visible in this header.
 * @internal
 */
struct RBBIDataHeader;
struct RBBIStateTableRow;


/**
 * A subclass of RuleBasedBreakIterator that adds tokenization functionality.
 *
 * <p>This class is for internal use only by Apple Computer, Inc.</p>
 */
class U_COMMON_API RuleBasedTokenizer : public RuleBasedBreakIterator {

private:
    // NOTE(review): the three members below are raw pointers. Who allocates
    // and frees them is not visible from this header -- confirm against the
    // implementation (init() and the destructor) before copying or assigning
    // instances of this class.

    /**
     * The row corresponding to the start state.
     * @internal
     */
    const RBBIStateTableRow *fStartRow;

    /**
     * The merged flag results for accepting states.
     * @internal
     */
    int32_t *fStateFlags;

    /**
     * Character categories for the Latin1 subset of Unicode.
     * @internal
     */
    int16_t *fLatin1Cat;

public:
    /**
     * Construct a RuleBasedTokenizer from a set of rules supplied as a string.
     * @param rules    The break rules to be used.
     * @param parseErr In the event of a syntax error in the rules, provides the location
     *                 within the rules of the problem.
     * @param status   Information on any errors encountered.
     * @internal
     */
    RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &status);

    /**
     * Constructor from a flattened set of RBBI data in uprv_malloc'd memory.
     *             RuleBasedBreakIterators built from a custom set of rules
     *             are created via this constructor; the rules are compiled
     *             into memory, then the break iterator is constructed here.
     *
     *             The break iterator adopts the memory, and will
     *             free it when done.
     * @param data   The flattened RBBI data; ownership passes to this object.
     * @param status Information on any errors encountered.
     * @internal
     */
    RuleBasedTokenizer(uint8_t *data, UErrorCode &status);

    /**
     * Tag type used to select the non-adopting constructor overload below.
     * Its single enumerator, kDontAdopt, carries no data; it exists only to
     * distinguish that overload from the adopting (uint8_t *) constructor.
     * @internal
     */
    enum EDontAdopt {
        kDontAdopt
    };

    /**
     * Constructor from a flattened set of RBBI data in memory which need not
     *             be malloced (e.g. it may be a memory-mapped file, etc.).
     *
     *             This version does not adopt the memory, and does not
     *             free it when done.
     * @param data      The flattened RBBI data; remains owned by the caller.
     *                  (Presumably it must outlive this object -- confirm
     *                  against the implementation.)
     * @param dontAdopt Overload-selection tag; pass kDontAdopt.
     * @param status    Information on any errors encountered.
     * @internal
     */
    RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt dontAdopt, UErrorCode &status);

    /**
     * Destructor
     *  @internal
     */
    virtual ~RuleBasedTokenizer();

    /**
     * Fetch the next set of tokens.
     * @param maxTokens The maximum number of tokens to return.
     * @param outTokenRanges Pointer to output array of token ranges.
     * @param outTokenFlags (optional) pointer to output array of token flags.
     * @return NOTE(review): not documented here; presumably the number of
     *         tokens written to outTokenRanges -- confirm against the
     *         implementation.
     * @internal
     */
    int32_t tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags);

private:
    /**
      * Common initialization function, used by constructors.
      * @internal
      */
    void init();
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

#endif