idna.h   [plain text]


/*
*******************************************************************************
*   Copyright (C) 2010-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  idna.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2010mar05
*   created by: Markus W. Scherer
*/

#ifndef __IDNA_H__
#define __IDNA_H__

/**
 * \file
 * \brief C++ API: Internationalizing Domain Names in Applications (IDNA)
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_IDNA

#include "unicode/bytestream.h"
#include "unicode/stringpiece.h"
#include "unicode/uidna.h"
#include "unicode/unistr.h"

U_NAMESPACE_BEGIN

class U_COMMON_API IDNAInfo;

/**
 * Abstract base class for IDNA processing.
 * See http://www.unicode.org/reports/tr46/
 * and http://www.ietf.org/rfc/rfc3490.txt
 *
 * The IDNA class is not intended for public subclassing.
 *
 * This C++ API currently only implements UTS #46.
 * The uidna.h C API implements both UTS #46 (functions using UIDNA service object)
 * and IDNA2003 (functions that do not use a service object).
 * @stable ICU 4.6
 */
class U_COMMON_API IDNA : public UObject {
public:
    /**
     * Destructor.
     * @stable ICU 4.6
     */
    ~IDNA();

    /**
     * Returns an IDNA instance which implements UTS #46.
     * Returns an unmodifiable instance, owned by the caller.
     * Cache it for multiple operations, and delete it when done.
     * The instance is thread-safe, that is, it can be used concurrently.
     *
     * UTS #46 defines Unicode IDNA Compatibility Processing,
     * updated to the latest version of Unicode and compatible with both
     * IDNA2003 and IDNA2008.
     *
     * The worker functions use transitional processing, including deviation mappings,
     * unless UIDNA_NONTRANSITIONAL_TO_ASCII or UIDNA_NONTRANSITIONAL_TO_UNICODE
     * is used in which case the deviation characters are passed through without change.
     *
     * Disallowed characters are mapped to U+FFFD.
     *
     * For available options see the uidna.h header.
     * Operations with the UTS #46 instance do not support the
     * UIDNA_ALLOW_UNASSIGNED option.
     *
     * By default, the UTS #46 implementation allows all ASCII characters (as valid or mapped).
     * When the UIDNA_USE_STD3_RULES option is used, ASCII characters other than
     * letters, digits, hyphen (LDH) and dot/full stop are disallowed and mapped to U+FFFD.
     *
     * @param options Bit set to modify the processing and error checking.
     *                See option bit set values in uidna.h.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return the UTS #46 IDNA instance, if successful
     * @stable ICU 4.6
     */
    static IDNA *
    createUTS46Instance(uint32_t options, UErrorCode &errorCode);

    /**
     * Converts a single domain name label into its ASCII form for DNS lookup.
     * If any processing step fails, then info.hasErrors() will be TRUE and
     * the result might not be an ASCII string.
     * The label might be modified according to the types of errors.
     * Labels with severe errors will be left in (or turned into) their Unicode form.
     *
     * The UErrorCode indicates an error only in exceptional cases,
     * such as a U_MEMORY_ALLOCATION_ERROR.
     *
     * @param label Input domain name label
     * @param dest Destination string object
     * @param info Output container of IDNA processing details.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.6
     */
    virtual UnicodeString &
    labelToASCII(const UnicodeString &label, UnicodeString &dest,
                 IDNAInfo &info, UErrorCode &errorCode) const = 0;

    /**
     * Converts a single domain name label into its Unicode form for human-readable display.
     * If any processing step fails, then info.hasErrors() will be TRUE.
     * The label might be modified according to the types of errors.
     *
     * The UErrorCode indicates an error only in exceptional cases,
     * such as a U_MEMORY_ALLOCATION_ERROR.
     *
     * @param label Input domain name label
     * @param dest Destination string object
     * @param info Output container of IDNA processing details.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.6
     */
    virtual UnicodeString &
    labelToUnicode(const UnicodeString &label, UnicodeString &dest,
                   IDNAInfo &info, UErrorCode &errorCode) const = 0;

    /**
     * Converts a whole domain name into its ASCII form for DNS lookup.
     * If any processing step fails, then info.hasErrors() will be TRUE and
     * the result might not be an ASCII string.
     * The domain name might be modified according to the types of errors.
     * Labels with severe errors will be left in (or turned into) their Unicode form.
     *
     * The UErrorCode indicates an error only in exceptional cases,
     * such as a U_MEMORY_ALLOCATION_ERROR.
     *
     * @param name Input domain name
     * @param dest Destination string object
     * @param info Output container of IDNA processing details.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.6
     */
    virtual UnicodeString &
    nameToASCII(const UnicodeString &name, UnicodeString &dest,
                IDNAInfo &info, UErrorCode &errorCode) const = 0;

    /**
     * Converts a whole domain name into its Unicode form for human-readable display.
     * If any processing step fails, then info.hasErrors() will be TRUE.
     * The domain name might be modified according to the types of errors.
     *
     * The UErrorCode indicates an error only in exceptional cases,
     * such as a U_MEMORY_ALLOCATION_ERROR.
     *
     * @param name Input domain name
     * @param dest Destination string object
     * @param info Output container of IDNA processing details.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.6
     */
    virtual UnicodeString &
    nameToUnicode(const UnicodeString &name, UnicodeString &dest,
                  IDNAInfo &info, UErrorCode &errorCode) const = 0;

    // UTF-8 versions of the processing methods ---------------------------- ***

    /**
     * Converts a single domain name label into its ASCII form for DNS lookup.
     * UTF-8 version of labelToASCII(), same behavior.
     *
     * @param label Input domain name label
     * @param dest Destination byte sink; Flush()ed if successful
     * @param info Output container of IDNA processing details.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.6
     */
    virtual void
    labelToASCII_UTF8(const StringPiece &label, ByteSink &dest,
                      IDNAInfo &info, UErrorCode &errorCode) const;

    /**
     * Converts a single domain name label into its Unicode form for human-readable display.
     * UTF-8 version of labelToUnicode(), same behavior.
     *
     * @param label Input domain name label
     * @param dest Destination byte sink; Flush()ed if successful
     * @param info Output container of IDNA processing details.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.6
     */
    virtual void
    labelToUnicodeUTF8(const StringPiece &label, ByteSink &dest,
                       IDNAInfo &info, UErrorCode &errorCode) const;

    /**
     * Converts a whole domain name into its ASCII form for DNS lookup.
     * UTF-8 version of nameToASCII(), same behavior.
     *
     * @param name Input domain name
     * @param dest Destination byte sink; Flush()ed if successful
     * @param info Output container of IDNA processing details.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.6
     */
    virtual void
    nameToASCII_UTF8(const StringPiece &name, ByteSink &dest,
                     IDNAInfo &info, UErrorCode &errorCode) const;

    /**
     * Converts a whole domain name into its Unicode form for human-readable display.
     * UTF-8 version of nameToUnicode(), same behavior.
     *
     * @param name Input domain name
     * @param dest Destination byte sink; Flush()ed if successful
     * @param info Output container of IDNA processing details.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     * @return dest
     * @stable ICU 4.6
     */
    virtual void
    nameToUnicodeUTF8(const StringPiece &name, ByteSink &dest,
                      IDNAInfo &info, UErrorCode &errorCode) const;

private:
    // No ICU "poor man's RTTI" for this class nor its subclasses.
    virtual UClassID getDynamicClassID() const;
};

class UTS46;

/**
 * Output container for IDNA processing errors.
 * The IDNAInfo class is not suitable for subclassing.
 * @stable ICU 4.6
 */
class U_COMMON_API IDNAInfo : public UMemory {
public:
    /**
     * Constructor for stack allocation.
     * @stable ICU 4.6
     */
    IDNAInfo() : errors(0), labelErrors(0), isTransDiff(FALSE), isBiDi(FALSE), isOkBiDi(TRUE) {}
    /**
     * Were there IDNA processing errors?
     * @return TRUE if there were processing errors
     * @stable ICU 4.6
     */
    UBool hasErrors() const { return errors!=0; }
    /**
     * Returns a bit set indicating IDNA processing errors.
     * See UIDNA_ERROR_... constants in uidna.h.
     * @return bit set of processing errors
     * @stable ICU 4.6
     */
    uint32_t getErrors() const { return errors; }
    /**
     * Returns TRUE if transitional and nontransitional processing produce different results.
     * This is the case when the input label or domain name contains
     * one or more deviation characters outside a Punycode label (see UTS #46).
     * <ul>
     * <li>With nontransitional processing, such characters are
     * copied to the destination string.
     * <li>With transitional processing, such characters are
     * mapped (sharp s/sigma) or removed (joiner/nonjoiner).
     * </ul>
     * @return TRUE if transitional and nontransitional processing produce different results
     * @stable ICU 4.6
     */
    UBool isTransitionalDifferent() const { return isTransDiff; }

private:
    friend class UTS46;

    IDNAInfo(const IDNAInfo &other);  // no copying
    IDNAInfo &operator=(const IDNAInfo &other);  // no copying

    void reset() {
        errors=labelErrors=0;
        isTransDiff=FALSE;
        isBiDi=FALSE;
        isOkBiDi=TRUE;
    }

    uint32_t errors, labelErrors;
    UBool isTransDiff;
    UBool isBiDi;
    UBool isOkBiDi;
};

U_NAMESPACE_END

#endif  // UCONFIG_NO_IDNA
#endif  // __IDNA_H__