// tridpars.h [plain text]

/*
 **************************************************************************
 *   Copyright (c) 2002-2004, International Business Machines Corporation *
 *   and others.  All Rights Reserved.                                    *
 **************************************************************************
 *   Date        Name        Description                                  *
 *   01/28/2002  aliu        Creation.                                    *
 **************************************************************************
 */
#ifndef TRIDPARS_H
#define TRIDPARS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/unistr.h"

U_NAMESPACE_BEGIN

class Transliterator;
class UnicodeSet;
class UVector;

/**
 * Parsing component for transliterator IDs.  This class contains only
 * static members; it cannot be instantiated.  Methods in this class
 * parse various ID formats, including the following:
 *
 * A basic ID, which contains source, target, and variant, but no
 * filter and no explicit inverse.  Examples include
 * "Latin-Greek/UNGEGN" and "Null".
 *
 * A single ID, which is a basic ID plus optional filter and optional
 * explicit inverse.  Examples include "[a-zA-Z] Latin-Greek" and
 * "Lower (Upper)".
 *
 * A compound ID, which is a sequence of one or more single IDs,
 * separated by semicolons, with optional forward and reverse global
 * filters.  The global filters are UnicodeSet patterns prepended or
 * appended to the IDs, separated by semicolons.  An appended filter
 * must be enclosed in parentheses and applies in the reverse
 * direction.
 *
 * @author Alan Liu
 */
class TransliteratorIDParser /* not : public UObject because all methods are static */ {

 public:

    /**
     * A structure containing the parsed data of a filtered ID, that
     * is, a basic ID optionally with a filter.
     *
     * 'source' and 'target' are always set.  The 'variant' is set
     * only if a non-empty variant was parsed.
     *
     * 'sawSource' is true if there was an explicit source in the
     * parsed id.  If there was no explicit source, then an implied
     * source of ANY is returned and 'sawSource' is set to false.
     *
     * 'filter' is the parsed filter pattern, if there was one.
     *
     * NOTE(review): all four strings are UnicodeString values, not
     * pointers, so "null" in the member comments below cannot be a
     * null pointer.  Presumably an absent value is represented by an
     * empty or bogus string -- confirm against the implementation.
     */
    class Specs : public UMemory {
    public:
        UnicodeString source; // not null
        UnicodeString target; // not null
        UnicodeString variant; // may be null
        UnicodeString filter; // may be null
        UBool sawSource;

        /**
         * Construct a Specs object.  The parameter names abbreviate
         * the member names (s = source, t = target, v = variant,
         * sawS = sawSource, f = filter); presumably each argument
         * initializes the like-named member -- the definition is not
         * visible in this header.
         */
        Specs(const UnicodeString& s, const UnicodeString& t,
              const UnicodeString& v, UBool sawS,
              const UnicodeString& f);

    private:

        // Copy constructor and assignment operator are declared
        // private so that client code cannot copy a Specs object.
        Specs(const Specs &other); // forbid copying of this class
        Specs &operator=(const Specs &other); // forbid copying of this class
    };

    /**
     * A structure containing the canonicalized data of a filtered ID,
     * that is, a basic ID optionally with a filter.
     *
     * 'canonID' is always set.  It may be the empty string "".
     * It is the id that should be assigned to the created
     * transliterator.  It _cannot_ be instantiated directly.
     *
     * 'basicID' is always set and non-empty.  It is always of
     * the form S-T or S-T/V.  It is designed to be fed to low-level
     * instantiation code that only understands these two formats.
     *
     * 'filter' is either absent, if there is none, or present and
     * non-empty.
     *
     * NOTE(review): the members are UnicodeString values, so an
     * absent filter cannot be a null pointer; presumably it is an
     * empty or bogus string -- confirm against the implementation.
     */
    class SingleID : public UMemory {
    public:
        UnicodeString canonID;
        UnicodeString basicID;
        UnicodeString filter;

        /**
         * Construct a SingleID with a filter.  The parameter names
         * abbreviate the member names (c = canonID, b = basicID,
         * f = filter).
         */
        SingleID(const UnicodeString& c, const UnicodeString& b,
                 const UnicodeString& f);

        /**
         * Construct a SingleID without supplying a filter
         * (c = canonID, b = basicID).
         */
        SingleID(const UnicodeString& c, const UnicodeString& b);

        /**
         * Create a Transliterator from this ID.
         * NOTE(review): ownership of the returned pointer and the
         * failure behavior are not stated in this header; presumably
         * the caller owns the result -- confirm.
         */
        Transliterator* createInstance();

    private:

        // Copy constructor and assignment operator are declared
        // private so that client code cannot copy a SingleID object.
        SingleID(const SingleID &other); // forbid copying of this class
        SingleID &operator=(const SingleID &other); // forbid copying of this class
    };

    /**
     * Parse a filter ID, that is, an ID of the general form
     * "[f1] s1-t1/v1", with the filter optional, and the variant optional.
     * @param id the id to be parsed
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @return a SingleID object or null if the parse fails
     */
    static SingleID* parseFilterID(const UnicodeString& id, int32_t& pos);

    /**
     * Parse a single ID, that is, an ID of the general form
     * "[f1] s1-t1/v1 ([f2] s2-t3/v2)", with the parenthesized element
     * optional, the filters optional, and the variants optional.
     * @param id the id to be parsed
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @param dir the direction.  If the direction is REVERSE then the
     * SingleID is constructed for the reverse direction.
     * @param status error code parameter.  NOTE(review): the
     * conditions under which it is set are not visible in this
     * header; see the implementation.
     * @return a SingleID object or null
     */
    static SingleID* parseSingleID(const UnicodeString& id, int32_t& pos,
                                  int32_t dir, UErrorCode& status);

    /**
     * Parse a global filter of the form "[f]" or "([f])", depending
     * on 'withParens'.
     * @param id the pattern to parse
     * @param pos INPUT-OUTPUT parameter.  On input, the position of
     * the first character to parse.  On output, the position after
     * the last character parsed.
     * @param dir the direction.
     * @param withParens INPUT-OUTPUT parameter.  On entry, if
     * withParens is 0, then parens are disallowed.  If it is 1,
     * then parens are required.  If it is -1, then parens are
     * optional, and on return withParens will be set to 0 or 1.
     * @param canonID OUTPUT parameter.  The pattern for the filter
     * added to the canonID, either at the end, if dir is FORWARD, or
     * at the start, if dir is REVERSE.  The pattern will be enclosed
     * in parentheses if appropriate, and will be suffixed with an
     * ID_DELIM character.  May be null.
     * @return a UnicodeSet object or null.  A non-null result
     * indicates a successful parse, regardless of whether the filter
     * applies to the given direction.  The caller should discard it
     * if withParens != (dir == REVERSE).
     */
    static UnicodeSet* parseGlobalFilter(const UnicodeString& id, int32_t& pos,
                                         int32_t dir,
                                         int32_t& withParens,
                                         UnicodeString* canonID);

    /**
     * Parse a compound ID, consisting of an optional forward global
     * filter, a separator, one or more single IDs delimited by
     * separators, and an optional reverse global filter.  The
     * separator is a semicolon.  The global filters are UnicodeSet
     * patterns.  The reverse global filter must be enclosed in
     * parentheses.
     * @param id the pattern to parse
     * @param dir the direction.
     * @param canonID OUTPUT parameter that receives the canonical ID,
     * consisting of canonical IDs for all elements, as returned by
     * parseSingleID(), separated by semicolons.  Previous contents
     * are discarded.
     * @param list OUTPUT parameter that receives a list of SingleID
     * objects representing the parsed IDs.  Previous contents are
     * discarded.
     * @param globalFilter OUTPUT parameter that receives a pointer to
     * a newly created global filter for this ID in this direction, or
     * null if there is none.
     * @return true if the parse succeeds, that is, if the entire
     * id is consumed without syntax error.
     */
    static UBool parseCompoundID(const UnicodeString& id, int32_t dir,
                                 UnicodeString& canonID,
                                 UVector& list,
                                 UnicodeSet*& globalFilter);

    /**
     * Convert the elements of the 'list' vector, which are SingleID
     * objects, into actual Transliterator objects.  In the course of
     * this, some (or all) entries may be removed.  If all entries
     * are removed, the Null transliterator will be added.
     *
     * Delete entries with empty basicIDs; these are generated by
     * elements like "(A)" in the forward direction, or "A()" in
     * the reverse.  THIS MAY RESULT IN AN EMPTY VECTOR.  Convert
     * SingleID entries to actual transliterators.
     *
     * Also, optionally, insert the given transliterator at the given
     * position.  This effectively happens before anything else.
     *
     * @param list vector of SingleID objects.  On exit, vector
     * of one or more Transliterators.
     * @param insert Transliterator to insert, or null if none.
     * @param insertIndex index from 0..list.size()-1, at which
     * to place 'insert', or -1 if none.
     * @param ec Output param to receive a success or an error code.
     * @return new value of insertIndex.  The index will shift if
     * there are empty items, like "(Lower)", with indices less than
     * insertIndex.
     */
    static int32_t instantiateList(UVector& list,
                                   Transliterator* insert,
                                   int32_t insertIndex,
                                   UErrorCode& ec);

    /**
     * Parse an ID into pieces.  Take IDs of the form T, T/V, S-T,
     * S-T/V, or S/V-T.  If the source is missing, return a source of
     * ANY.
     *
     * This function returns nothing; all results are delivered
     * through the four reference parameters below.
     *
     * @param id the id string, in any of several forms
     * @param source          OUTPUT parameter that receives the source.
     *                        If the source is not present in the id,
     *                        ANY is given as the source.
     * @param target          OUTPUT parameter that receives the target.
     *                        The target may be empty if the id is not
     *                        well-formed.
     * @param variant         OUTPUT parameter that receives the variant.
     *                        The variant may be empty.
     * @param isSourcePresent OUTPUT parameter.  If TRUE then the
     *                        source is present in the id; otherwise
     *                        the source was not present and ANY was
     *                        substituted.
     */
    static void IDtoSTV(const UnicodeString& id,
                        UnicodeString& source,
                        UnicodeString& target,
                        UnicodeString& variant,
                        UBool& isSourcePresent);

    /**
     * Given source, target, and variant strings, concatenate them into a
     * full ID.  If the source is empty, then "Any" will be used for the
     * source, so the ID will always be of the form s-t/v or s-t.
     * @param source  the source string; may be empty
     * @param target  the target string
     * @param variant the variant string
     * @param id      OUTPUT parameter that receives the full ID
     */
    static void STVtoID(const UnicodeString& source,
                        const UnicodeString& target,
                        const UnicodeString& variant,
                        UnicodeString& id);

    /**
     * Register two targets as being inverses of one another.  For
     * example, calling registerSpecialInverse("NFC", "NFD", true, status)
     * causes Transliterator to form the following inverse relationships:
     *
     * <pre>NFC => NFD
     * Any-NFC => Any-NFD
     * NFD => NFC
     * Any-NFD => Any-NFC</pre>
     *
     * (Without the special inverse registration, the inverse of NFC
     * would be NFC-Any.)  Note that NFD is shorthand for Any-NFD, but
     * that the presence or absence of "Any-" is preserved.
     *
     * <p>The relationship is symmetrical; registering (a, b) is
     * equivalent to registering (b, a).
     *
     * <p>The relevant IDs must still be registered separately as
     * factories or classes.
     *
     * <p>Only the targets are specified.  Special inverses always
     * have the form Any-Target1 <=> Any-Target2.  The target should
     * have canonical casing (the casing desired to be produced when
     * an inverse is formed) and should contain no whitespace or other
     * extraneous characters.
     *
     * @param target the target against which to register the inverse
     * @param inverseTarget the inverse of target, that is
     * Any-target.getInverse() => Any-inverseTarget
     * @param bidirectional if true, register the reverse relation
     * as well, that is, Any-inverseTarget.getInverse() => Any-target
     * @param status error code parameter.  NOTE(review): the
     * conditions under which it is set are not visible in this
     * header; see the implementation.
     */
    static void registerSpecialInverse(const UnicodeString& target,
                                       const UnicodeString& inverseTarget,
                                       UBool bidirectional,
                                       UErrorCode &status);

    /**
     * Free static memory.
     */
    static void cleanup();

 private:
    //----------------------------------------------------------------
    // Private implementation
    //----------------------------------------------------------------

    // forbid instantiation
    TransliteratorIDParser();

    /**
     * Parse an ID into component pieces.  Take IDs of the form T,
     * T/V, S-T, S-T/V, or S/V-T.  If the source is missing, return a
     * source of ANY.
     *
     * This private overload differs from the public parseFilterID()
     * by its extra 'allowFilter' parameter and by returning the
     * less-processed Specs object rather than a SingleID.
     *
     * @param id the id string, in any of several forms
     * @param pos INPUT-OUTPUT parameter.  On input, pos is the
     * offset of the first character to parse in id.  On output,
     * pos is the offset after the last parsed character.  If the
     * parse failed, pos will be unchanged.
     * @param allowFilter if true, a UnicodeSet pattern is allowed
     * at any location between specs or delimiters, and is returned
     * as the 'filter' member of the Specs object.
     * @return a Specs object, or null if the parse failed.  If
     * neither source nor target was seen in the parsed id, then the
     * parse fails.  If allowFilter is true, then the parsed filter
     * pattern is returned in the Specs object, otherwise the returned
     * Specs object carries no filter.  If the parse fails for any
     * reason null is returned.
     */
    static Specs* parseFilterID(const UnicodeString& id, int32_t& pos,
                                UBool allowFilter);

    /**
     * Given a Specs object, convert it to a SingleID object.  The
     * Specs object is a more unprocessed parse result.  The SingleID
     * object contains information about canonical and basic IDs.
     * @param specs the given Specs object.
     * @param dir   either FORWARD or REVERSE.
     * @return a SingleID; never returns null.  The returned object
     * never carries a filter.
     */
    static SingleID* specsToID(const Specs* specs, int32_t dir);

    /**
     * Given a Specs object, return a SingleID representing the
     * special inverse of that ID.  If there is no special inverse
     * then return null.
     * @param specs the given Specs.
     * @param status error code parameter.  NOTE(review): the
     * conditions under which it is set are not visible in this
     * header; see the implementation.
     * @return a SingleID or null.  The returned object never
     * carries a filter.
     */
    static SingleID* specsToSpecialInverse(const Specs& specs, UErrorCode &status);

    /**
     * Glue method to get around access problems in C++.
     * @param id the id string for the transliterator, in any of several forms
     * @param canonID the given canonical ID
     * @return a Transliterator pointer.  NOTE(review): ownership and
     * the failure result are not stated in this header -- confirm
     * against the implementation.
     */
    static Transliterator* createBasicInstance(const UnicodeString& id,
                                               const UnicodeString* canonID);

    /**
     * Initialize static memory.
     * @param status error code parameter.  NOTE(review): the
     * conditions under which it is set are not visible in this
     * header; see the implementation.
     */
    static void init(UErrorCode &status);

    // Grants the nested SingleID class access to this class's private
    // members.  NOTE(review): presumably needed so that
    // SingleID::createInstance() can call createBasicInstance() --
    // confirm against the implementation.
    friend class SingleID;
};

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif