// identifier_info.h   [plain text]


/*
**********************************************************************
*   Copyright (C) 2014, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* identifier_info.h
* 
* created on: 2013 Jan 7
* created by: Andy Heninger
*/

#ifndef __IDENTIFIER_INFO_H__
#define __IDENTIFIER_INFO_H__

#include "unicode/utypes.h"

#include "unicode/uniset.h"
#include "unicode/uspoof.h"
#include "uhash.h"

U_NAMESPACE_BEGIN

class ScriptSet;

// TODO(andy): review consistency of reference vs pointer arguments to the functions.

/**
 * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
 * then setIdentifier. Available methods include:
 * <ol>
 * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
 * each of these.
 * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
 * either Katakana or Hiragana.
 * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
 * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
 * the identifier.
 * <li>call getRestrictionLevel to see what the UTS36 restriction level is.
 * </ol>
 * 
 * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
 */
class U_I18N_API IdentifierInfo : public UMemory {

  public:
    /**
     * Create an identifier info object. Subsequently, call setIdentifier(), etc.
     *
     * @param status Errorcode, set if errors occur.
     * @internal
     */
    IdentifierInfo(UErrorCode &status);

    /**
      * Destructor
      */
    virtual ~IdentifierInfo();

  private:
    /*
     * Disallow copying for now. Can be added if there's a need.
     *
     * Both copy operations are declared private so that copying from outside
     * the class fails to compile. The data members are raw pointers, so a
     * compiler-generated member-wise copy or assignment would leave two
     * objects sharing the same pointers.
     */
    IdentifierInfo(const IdentifierInfo &other);
    /* Intentionally not implemented; exists only to suppress the implicit copy assignment. */
    IdentifierInfo &operator=(const IdentifierInfo &other);

  public:

    /**
     * Set the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @param identifierProfile the characters that are to be allowed in the identifier
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);

    /**
     * Get the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @return The characters that are to be allowed in the identifier.
     * @internal
     */
    const UnicodeSet &getIdentifierProfile() const;


    /**
     * Set an identifier to analyze. Afterwards, call methods like getScripts()
     *
     * @param identifier the identifier to analyze
     * @param status Errorcode, set if errors occur.
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);


    /**
     * Get the identifier that was analyzed. The returned string is owned by the ICU library,
     * and must not be deleted by the caller.
     *
     * @return the identifier that was analyzed.
     * @internal
     */
    const UnicodeString *getIdentifier() const;


    /**
     * Get the scripts found in the identifier.
     *
     * @return the set of explicit scripts.
     * @internal
     */
    const ScriptSet *getScripts() const;

    /**
     * Get the set of alternate scripts found in the identifier. That is, when a character can be in two scripts, then
     * the set consisting of those scripts will be returned.
     *
     * @return a uhash, with each key being of type (ScriptSet *).
     *         This is a set, not a map, so the value stored in the uhash is not relevant.
     *         (It is, in fact, 1).
     *         Ownership of the uhash and its contents remains with the IdentifierInfo object,
     *         and they remain valid until a new identifier is set or until the object is deleted.
     * @internal
     */
    const UHashtable *getAlternates() const;

    /**
     * Get the representative characters (zeros) for the numerics found in the identifier.
     *
     * @return the set of representative zero characters, one for each decimal
     *         number system appearing in the identifier.
     * @internal
     */
    const UnicodeSet *getNumerics() const;

    /**
     * Find out which scripts are in common among the alternates.
     *
     * @return the set of scripts that are in common among the alternates.
     * @internal
     */
    const ScriptSet *getCommonAmongAlternates() const;

    /**
      * Get the number of scripts appearing in the identifier.
      *   Note: Common and Inherited scripts are omitted from the count.
      *   Note: Result may be high when the identifier contains characters
      *         with alternate scripts. The distinction between
      *         0, 1 and > 1 will remain valid, however.
      * @return the number of scripts.
      * @internal
      */
    int32_t getScriptCount() const;

#if !UCONFIG_NO_NORMALIZATION

    /**
     * Find the "tightest" restriction level that the identifier satisfies.
     * Only available when ICU is built with normalization support.
     *
     * @param status Errorcode, set if errors occur.
     * @return the restriction level.
     * @internal
     */
    URestrictionLevel getRestrictionLevel(UErrorCode &status) const;

#endif /*!UCONFIG_NO_NORMALIZATION */

    /**
     * Produce a string form of this object.
     * NOTE(review): the exact format is defined by the implementation, which is
     * not visible from this header.
     *
     * @return the string form, returned by value.
     * @internal
     */
    UnicodeString toString() const;

    /**
     * Produce a readable string of alternates.
     *
     * @param dest the string that receives the display form.
     * @param alternates a UHashtable of UScriptSets.
     *        Keys only, no meaningful values in the UHash.
     * @param status Errorcode, set if errors occur.
     * @return display form (presumably a reference to dest; confirm against the implementation)
     * @internal
     */
    static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);

  private:

    /* Presumably resets the per-identifier analysis state; returns a reference
     * for chaining. TODO confirm against the implementation. */
    IdentifierInfo  & clear();

    /* Presumably tests whether `containee` is covered by `container`, taking the
     * alternate script sets into account. TODO confirm against the implementation. */
    UBool             containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;

    /*
     * Analysis state. Judging by the names, these back the corresponding
     * public getters (getIdentifier, getScripts, getAlternates,
     * getCommonAmongAlternates, getNumerics, getIdentifierProfile);
     * the mapping is not verifiable from this header alone.
     */
    UnicodeString     *fIdentifier;
    ScriptSet         *fRequiredScripts;
    UHashtable        *fScriptSetSet;
    ScriptSet         *fCommonAmongAlternates;
    UnicodeSet        *fNumerics;
    UnicodeSet        *fIdentifierProfile;
};

U_NAMESPACE_END

#endif // __IDENTIFIER_INFO_H__