// LocaleToScriptMappingDefault.cpp


/*
 * Copyright (C) 2011 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "LocaleToScriptMapping.h"

#include <wtf/HashMap.h>
#include <wtf/NeverDestroyed.h>
#include <wtf/text/StringHash.h>

namespace WebCore {

// One row of scriptNameCodeList: a script name paired with the UScriptCode it resolves to.
// Rows are written as positional aggregate initializers, so the member order matters.
struct ScriptNameCode {
    ASCIILiteral name; // Script name; used as the lookup key in scriptNameToCode().
    UScriptCode code; // Script reported for that name.
};

// This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are
// treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to
// USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered
// using the same font setting.
//
// Other folded entries in this table: "kana", "hrkt" and "jpan" also map to USCRIPT_KATAKANA_OR_HIRAGANA, and
// "kore" shares USCRIPT_HANGUL with "hang". Names are stored in lowercase; scriptNameToCode() looks them up
// ASCII case-insensitively, so callers do not need to normalize case.
static const ScriptNameCode scriptNameCodeList[] = {
    { "zyyy"_s, USCRIPT_COMMON },
    { "qaai"_s, USCRIPT_INHERITED },
    { "arab"_s, USCRIPT_ARABIC },
    { "armn"_s, USCRIPT_ARMENIAN },
    { "beng"_s, USCRIPT_BENGALI },
    { "bopo"_s, USCRIPT_BOPOMOFO },
    { "cher"_s, USCRIPT_CHEROKEE },
    { "copt"_s, USCRIPT_COPTIC },
    { "cyrl"_s, USCRIPT_CYRILLIC },
    { "dsrt"_s, USCRIPT_DESERET },
    { "deva"_s, USCRIPT_DEVANAGARI },
    { "ethi"_s, USCRIPT_ETHIOPIC },
    { "geor"_s, USCRIPT_GEORGIAN },
    { "goth"_s, USCRIPT_GOTHIC },
    { "grek"_s, USCRIPT_GREEK },
    { "gujr"_s, USCRIPT_GUJARATI },
    { "guru"_s, USCRIPT_GURMUKHI },
    { "hani"_s, USCRIPT_HAN },
    { "hang"_s, USCRIPT_HANGUL },
    { "hebr"_s, USCRIPT_HEBREW },
    { "hira"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
    { "knda"_s, USCRIPT_KANNADA },
    { "kana"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
    { "khmr"_s, USCRIPT_KHMER },
    { "laoo"_s, USCRIPT_LAO },
    { "latn"_s, USCRIPT_LATIN },
    { "mlym"_s, USCRIPT_MALAYALAM },
    { "mong"_s, USCRIPT_MONGOLIAN },
    { "mymr"_s, USCRIPT_MYANMAR },
    { "ogam"_s, USCRIPT_OGHAM },
    { "ital"_s, USCRIPT_OLD_ITALIC },
    { "orya"_s, USCRIPT_ORIYA },
    { "runr"_s, USCRIPT_RUNIC },
    { "sinh"_s, USCRIPT_SINHALA },
    { "syrc"_s, USCRIPT_SYRIAC },
    { "taml"_s, USCRIPT_TAMIL },
    { "telu"_s, USCRIPT_TELUGU },
    { "thaa"_s, USCRIPT_THAANA },
    { "thai"_s, USCRIPT_THAI },
    { "tibt"_s, USCRIPT_TIBETAN },
    { "cans"_s, USCRIPT_CANADIAN_ABORIGINAL },
    { "yiii"_s, USCRIPT_YI },
    { "tglg"_s, USCRIPT_TAGALOG },
    { "hano"_s, USCRIPT_HANUNOO },
    { "buhd"_s, USCRIPT_BUHID },
    { "tagb"_s, USCRIPT_TAGBANWA },
    { "brai"_s, USCRIPT_BRAILLE },
    { "cprt"_s, USCRIPT_CYPRIOT },
    { "limb"_s, USCRIPT_LIMBU },
    { "linb"_s, USCRIPT_LINEAR_B },
    { "osma"_s, USCRIPT_OSMANYA },
    { "shaw"_s, USCRIPT_SHAVIAN },
    { "tale"_s, USCRIPT_TAI_LE },
    { "ugar"_s, USCRIPT_UGARITIC },
    { "hrkt"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
    { "bugi"_s, USCRIPT_BUGINESE },
    { "glag"_s, USCRIPT_GLAGOLITIC },
    { "khar"_s, USCRIPT_KHAROSHTHI },
    { "sylo"_s, USCRIPT_SYLOTI_NAGRI },
    { "talu"_s, USCRIPT_NEW_TAI_LUE },
    { "tfng"_s, USCRIPT_TIFINAGH },
    { "xpeo"_s, USCRIPT_OLD_PERSIAN },
    { "bali"_s, USCRIPT_BALINESE },
    { "batk"_s, USCRIPT_BATAK },
    { "blis"_s, USCRIPT_BLISSYMBOLS },
    { "brah"_s, USCRIPT_BRAHMI },
    { "cham"_s, USCRIPT_CHAM },
    { "cirt"_s, USCRIPT_CIRTH },
    { "cyrs"_s, USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC },
    { "egyd"_s, USCRIPT_DEMOTIC_EGYPTIAN },
    { "egyh"_s, USCRIPT_HIERATIC_EGYPTIAN },
    { "egyp"_s, USCRIPT_EGYPTIAN_HIEROGLYPHS },
    { "geok"_s, USCRIPT_KHUTSURI },
    { "hans"_s, USCRIPT_SIMPLIFIED_HAN },
    { "hant"_s, USCRIPT_TRADITIONAL_HAN },
    { "hmng"_s, USCRIPT_PAHAWH_HMONG },
    { "hung"_s, USCRIPT_OLD_HUNGARIAN },
    { "inds"_s, USCRIPT_HARAPPAN_INDUS },
    { "java"_s, USCRIPT_JAVANESE },
    { "kali"_s, USCRIPT_KAYAH_LI },
    { "latf"_s, USCRIPT_LATIN_FRAKTUR },
    { "latg"_s, USCRIPT_LATIN_GAELIC },
    { "lepc"_s, USCRIPT_LEPCHA },
    { "lina"_s, USCRIPT_LINEAR_A },
    { "mand"_s, USCRIPT_MANDAEAN },
    { "maya"_s, USCRIPT_MAYAN_HIEROGLYPHS },
    { "mero"_s, USCRIPT_MEROITIC },
    { "nkoo"_s, USCRIPT_NKO },
    { "orkh"_s, USCRIPT_ORKHON },
    { "perm"_s, USCRIPT_OLD_PERMIC },
    { "phag"_s, USCRIPT_PHAGS_PA },
    { "phnx"_s, USCRIPT_PHOENICIAN },
    { "plrd"_s, USCRIPT_PHONETIC_POLLARD },
    { "roro"_s, USCRIPT_RONGORONGO },
    { "sara"_s, USCRIPT_SARATI },
    { "syre"_s, USCRIPT_ESTRANGELO_SYRIAC },
    { "syrj"_s, USCRIPT_WESTERN_SYRIAC },
    { "syrn"_s, USCRIPT_EASTERN_SYRIAC },
    { "teng"_s, USCRIPT_TENGWAR },
    { "vaii"_s, USCRIPT_VAI },
    { "visp"_s, USCRIPT_VISIBLE_SPEECH },
    { "xsux"_s, USCRIPT_CUNEIFORM },
    { "jpan"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
    { "kore"_s, USCRIPT_HANGUL },
    { "zxxx"_s, USCRIPT_UNWRITTEN_LANGUAGES },
    { "zzzz"_s, USCRIPT_UNKNOWN }
};

// Key traits for the script-name lookup map. The minimum table size is derived at compile time
// from the number of rows in scriptNameCodeList (presumably so that filling the map with every
// row does not need to grow the table).
struct ScriptNameCodeMapHashTraits : HashTraits<String> {
    static constexpr int minimumTableSize = WTF::HashTableCapacityForSize<WTF_ARRAY_LENGTH(scriptNameCodeList)>::value;
};

// Resolves a script name (see scriptNameCodeList) to its UScriptCode.
//
// The comparison is ASCII case-insensitive. Returns USCRIPT_INVALID_CODE when the name is null,
// empty, or not present in the table.
UScriptCode scriptNameToCode(const String& scriptName)
{
    // A null String must never be used as a lookup key: ASCIICaseInsensitiveHash hashes through
    // the string's StringImpl, and a null String is also the empty-bucket value of
    // HashTraits<String>. No table entry is empty, so rejecting null/empty names here does not
    // change the result of any lookup that previously succeeded.
    if (scriptName.isEmpty())
        return USCRIPT_INVALID_CODE;

    // Built once on first use and intentionally never destroyed.
    static const auto scriptNameCodeMap = makeNeverDestroyed([] {
        HashMap<String, UScriptCode, ASCIICaseInsensitiveHash, ScriptNameCodeMapHashTraits> map;
        for (auto& nameAndCode : scriptNameCodeList)
            map.add(nameAndCode.name, nameAndCode.code);
        return map;
    }());

    const auto& map = scriptNameCodeMap.get();
    auto it = map.find(scriptName);
    if (it == map.end())
        return USCRIPT_INVALID_CODE;
    return it->value;
}

// One row of localeScriptList: a locale identifier paired with the script to use for it.
// Rows are written as positional aggregate initializers, so the member order matters.
struct LocaleScript {
    ASCIILiteral locale; // Locale key; used for lookup in localeToScriptCodeForFontSelection().
    UScriptCode script; // Script reported for that locale.
};

// Maps a locale identifier to the script used for font selection for that locale.
//
// Keys are lowercase and use '_' as the subtag separator (e.g. "zh_hk");
// localeToScriptCodeForFontSelection() converts '-' to '_' and matches ASCII case-insensitively
// before consulting this table. Most keys are a bare language code; "zh_hk" and "zh_tw" are the
// only entries with a region, overriding the plain "zh" entry (USCRIPT_HAN) with
// USCRIPT_TRADITIONAL_HAN. The undetermined locale "und" maps to USCRIPT_LATIN.
static const LocaleScript localeScriptList[] = {
    { "aa"_s, USCRIPT_LATIN },
    { "ab"_s, USCRIPT_CYRILLIC },
    { "ady"_s, USCRIPT_CYRILLIC },
    { "af"_s, USCRIPT_LATIN },
    { "ak"_s, USCRIPT_LATIN },
    { "am"_s, USCRIPT_ETHIOPIC },
    { "ar"_s, USCRIPT_ARABIC },
    { "as"_s, USCRIPT_BENGALI },
    { "ast"_s, USCRIPT_LATIN },
    { "av"_s, USCRIPT_CYRILLIC },
    { "ay"_s, USCRIPT_LATIN },
    { "az"_s, USCRIPT_LATIN },
    { "ba"_s, USCRIPT_CYRILLIC },
    { "be"_s, USCRIPT_CYRILLIC },
    { "bg"_s, USCRIPT_CYRILLIC },
    { "bi"_s, USCRIPT_LATIN },
    { "bn"_s, USCRIPT_BENGALI },
    { "bo"_s, USCRIPT_TIBETAN },
    { "bs"_s, USCRIPT_LATIN },
    { "ca"_s, USCRIPT_LATIN },
    { "ce"_s, USCRIPT_CYRILLIC },
    { "ceb"_s, USCRIPT_LATIN },
    { "ch"_s, USCRIPT_LATIN },
    { "chk"_s, USCRIPT_LATIN },
    { "cs"_s, USCRIPT_LATIN },
    { "cy"_s, USCRIPT_LATIN },
    { "da"_s, USCRIPT_LATIN },
    { "de"_s, USCRIPT_LATIN },
    { "dv"_s, USCRIPT_THAANA },
    { "dz"_s, USCRIPT_TIBETAN },
    { "ee"_s, USCRIPT_LATIN },
    { "efi"_s, USCRIPT_LATIN },
    { "el"_s, USCRIPT_GREEK },
    { "en"_s, USCRIPT_LATIN },
    { "es"_s, USCRIPT_LATIN },
    { "et"_s, USCRIPT_LATIN },
    { "eu"_s, USCRIPT_LATIN },
    { "fa"_s, USCRIPT_ARABIC },
    { "fi"_s, USCRIPT_LATIN },
    { "fil"_s, USCRIPT_LATIN },
    { "fj"_s, USCRIPT_LATIN },
    { "fo"_s, USCRIPT_LATIN },
    { "fr"_s, USCRIPT_LATIN },
    { "fur"_s, USCRIPT_LATIN },
    { "fy"_s, USCRIPT_LATIN },
    { "ga"_s, USCRIPT_LATIN },
    { "gaa"_s, USCRIPT_LATIN },
    { "gd"_s, USCRIPT_LATIN },
    { "gil"_s, USCRIPT_LATIN },
    { "gl"_s, USCRIPT_LATIN },
    { "gn"_s, USCRIPT_LATIN },
    { "gsw"_s, USCRIPT_LATIN },
    { "gu"_s, USCRIPT_GUJARATI },
    { "ha"_s, USCRIPT_LATIN },
    { "haw"_s, USCRIPT_LATIN },
    { "he"_s, USCRIPT_HEBREW },
    { "hi"_s, USCRIPT_DEVANAGARI },
    { "hil"_s, USCRIPT_LATIN },
    { "ho"_s, USCRIPT_LATIN },
    { "hr"_s, USCRIPT_LATIN },
    { "ht"_s, USCRIPT_LATIN },
    { "hu"_s, USCRIPT_LATIN },
    { "hy"_s, USCRIPT_ARMENIAN },
    { "id"_s, USCRIPT_LATIN },
    { "ig"_s, USCRIPT_LATIN },
    { "ii"_s, USCRIPT_YI },
    { "ilo"_s, USCRIPT_LATIN },
    { "inh"_s, USCRIPT_CYRILLIC },
    { "is"_s, USCRIPT_LATIN },
    { "it"_s, USCRIPT_LATIN },
    { "iu"_s, USCRIPT_CANADIAN_ABORIGINAL },
    { "ja"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
    { "jv"_s, USCRIPT_LATIN },
    { "ka"_s, USCRIPT_GEORGIAN },
    { "kaj"_s, USCRIPT_LATIN },
    { "kam"_s, USCRIPT_LATIN },
    { "kbd"_s, USCRIPT_CYRILLIC },
    { "kha"_s, USCRIPT_LATIN },
    { "kk"_s, USCRIPT_CYRILLIC },
    { "kl"_s, USCRIPT_LATIN },
    { "km"_s, USCRIPT_KHMER },
    { "kn"_s, USCRIPT_KANNADA },
    { "ko"_s, USCRIPT_HANGUL },
    { "kok"_s, USCRIPT_DEVANAGARI },
    { "kos"_s, USCRIPT_LATIN },
    { "kpe"_s, USCRIPT_LATIN },
    { "krc"_s, USCRIPT_CYRILLIC },
    { "ks"_s, USCRIPT_ARABIC },
    { "ku"_s, USCRIPT_ARABIC },
    { "kum"_s, USCRIPT_CYRILLIC },
    { "ky"_s, USCRIPT_CYRILLIC },
    { "la"_s, USCRIPT_LATIN },
    { "lah"_s, USCRIPT_ARABIC },
    { "lb"_s, USCRIPT_LATIN },
    { "lez"_s, USCRIPT_CYRILLIC },
    { "ln"_s, USCRIPT_LATIN },
    { "lo"_s, USCRIPT_LAO },
    { "lt"_s, USCRIPT_LATIN },
    { "lv"_s, USCRIPT_LATIN },
    { "mai"_s, USCRIPT_DEVANAGARI },
    { "mdf"_s, USCRIPT_CYRILLIC },
    { "mg"_s, USCRIPT_LATIN },
    { "mh"_s, USCRIPT_LATIN },
    { "mi"_s, USCRIPT_LATIN },
    { "mk"_s, USCRIPT_CYRILLIC },
    { "ml"_s, USCRIPT_MALAYALAM },
    { "mn"_s, USCRIPT_CYRILLIC },
    { "mr"_s, USCRIPT_DEVANAGARI },
    { "ms"_s, USCRIPT_LATIN },
    { "mt"_s, USCRIPT_LATIN },
    { "my"_s, USCRIPT_MYANMAR },
    { "myv"_s, USCRIPT_CYRILLIC },
    { "na"_s, USCRIPT_LATIN },
    { "nb"_s, USCRIPT_LATIN },
    { "ne"_s, USCRIPT_DEVANAGARI },
    { "niu"_s, USCRIPT_LATIN },
    { "nl"_s, USCRIPT_LATIN },
    { "nn"_s, USCRIPT_LATIN },
    { "nr"_s, USCRIPT_LATIN },
    { "nso"_s, USCRIPT_LATIN },
    { "ny"_s, USCRIPT_LATIN },
    { "oc"_s, USCRIPT_LATIN },
    { "om"_s, USCRIPT_LATIN },
    { "or"_s, USCRIPT_ORIYA },
    { "os"_s, USCRIPT_CYRILLIC },
    { "pa"_s, USCRIPT_GURMUKHI },
    { "pag"_s, USCRIPT_LATIN },
    { "pap"_s, USCRIPT_LATIN },
    { "pau"_s, USCRIPT_LATIN },
    { "pl"_s, USCRIPT_LATIN },
    { "pon"_s, USCRIPT_LATIN },
    { "ps"_s, USCRIPT_ARABIC },
    { "pt"_s, USCRIPT_LATIN },
    { "qu"_s, USCRIPT_LATIN },
    { "rm"_s, USCRIPT_LATIN },
    { "rn"_s, USCRIPT_LATIN },
    { "ro"_s, USCRIPT_LATIN },
    { "ru"_s, USCRIPT_CYRILLIC },
    { "rw"_s, USCRIPT_LATIN },
    { "sa"_s, USCRIPT_DEVANAGARI },
    { "sah"_s, USCRIPT_CYRILLIC },
    { "sat"_s, USCRIPT_LATIN },
    { "sd"_s, USCRIPT_ARABIC },
    { "se"_s, USCRIPT_LATIN },
    { "sg"_s, USCRIPT_LATIN },
    { "si"_s, USCRIPT_SINHALA },
    { "sid"_s, USCRIPT_LATIN },
    { "sk"_s, USCRIPT_LATIN },
    { "sl"_s, USCRIPT_LATIN },
    { "sm"_s, USCRIPT_LATIN },
    { "so"_s, USCRIPT_LATIN },
    { "sq"_s, USCRIPT_LATIN },
    { "sr"_s, USCRIPT_CYRILLIC },
    { "ss"_s, USCRIPT_LATIN },
    { "st"_s, USCRIPT_LATIN },
    { "su"_s, USCRIPT_LATIN },
    { "sv"_s, USCRIPT_LATIN },
    { "sw"_s, USCRIPT_LATIN },
    { "ta"_s, USCRIPT_TAMIL },
    { "te"_s, USCRIPT_TELUGU },
    { "tet"_s, USCRIPT_LATIN },
    { "tg"_s, USCRIPT_CYRILLIC },
    { "th"_s, USCRIPT_THAI },
    { "ti"_s, USCRIPT_ETHIOPIC },
    { "tig"_s, USCRIPT_ETHIOPIC },
    { "tk"_s, USCRIPT_LATIN },
    { "tkl"_s, USCRIPT_LATIN },
    { "tl"_s, USCRIPT_LATIN },
    { "tn"_s, USCRIPT_LATIN },
    { "to"_s, USCRIPT_LATIN },
    { "tpi"_s, USCRIPT_LATIN },
    { "tr"_s, USCRIPT_LATIN },
    { "trv"_s, USCRIPT_LATIN },
    { "ts"_s, USCRIPT_LATIN },
    { "tt"_s, USCRIPT_CYRILLIC },
    { "tvl"_s, USCRIPT_LATIN },
    { "tw"_s, USCRIPT_LATIN },
    { "ty"_s, USCRIPT_LATIN },
    { "tyv"_s, USCRIPT_CYRILLIC },
    { "udm"_s, USCRIPT_CYRILLIC },
    { "ug"_s, USCRIPT_ARABIC },
    { "uk"_s, USCRIPT_CYRILLIC },
    { "und"_s, USCRIPT_LATIN },
    { "ur"_s, USCRIPT_ARABIC },
    { "uz"_s, USCRIPT_CYRILLIC },
    { "ve"_s, USCRIPT_LATIN },
    { "vi"_s, USCRIPT_LATIN },
    { "wal"_s, USCRIPT_ETHIOPIC },
    { "war"_s, USCRIPT_LATIN },
    { "wo"_s, USCRIPT_LATIN },
    { "xh"_s, USCRIPT_LATIN },
    { "yap"_s, USCRIPT_LATIN },
    { "yo"_s, USCRIPT_LATIN },
    { "za"_s, USCRIPT_LATIN },
    { "zh"_s, USCRIPT_HAN },
    { "zh_hk"_s, USCRIPT_TRADITIONAL_HAN },
    { "zh_tw"_s, USCRIPT_TRADITIONAL_HAN },
    { "zu"_s, USCRIPT_LATIN }
};

// Key traits for the locale lookup map. The minimum table size is derived at compile time from
// the number of rows in localeScriptList (presumably so that filling the map with every row
// does not need to grow the table).
struct LocaleScriptMapHashTraits : HashTraits<String> {
    static constexpr int minimumTableSize = WTF::HashTableCapacityForSize<WTF_ARRAY_LENGTH(localeScriptList)>::value;
};

// Picks the script whose font setting should be used for |locale|.
//
// The locale is normalized by turning '-' into '_' and then resolved from most to least specific:
//   1. If the whole remaining identifier is a key of localeScriptList, that entry wins.
//   2. Otherwise the last '_'-separated subtag is tried as a script name via scriptNameToCode();
//      any result other than USCRIPT_INVALID_CODE / USCRIPT_UNKNOWN is returned.
//   3. Otherwise the last subtag is dropped and the process repeats.
// USCRIPT_COMMON is returned once nothing is left to try.
UScriptCode localeToScriptCodeForFontSelection(const String& locale)
{
    // Built once on first use and intentionally never destroyed.
    static const auto localeScriptMap = makeNeverDestroyed([] {
        HashMap<String, UScriptCode, ASCIICaseInsensitiveHash, LocaleScriptMapHashTraits> map;
        for (auto& entry : localeScriptList)
            map.add(entry.locale, entry.script);
        return map;
    }());
    const auto& knownLocales = localeScriptMap.get();

    String remaining = locale;
    remaining.replace('-', '_');

    while (!remaining.isEmpty()) {
        auto match = knownLocales.find(remaining);
        if (match != knownLocales.end())
            return match->value;

        auto separatorPosition = remaining.reverseFind('_');
        if (separatorPosition == notFound)
            return USCRIPT_COMMON;

        // The trailing subtag may itself name a script (e.g. the "Hant" in "zh_Hant").
        UScriptCode subtagScript = scriptNameToCode(remaining.substring(separatorPosition + 1));
        bool isUsableScript = subtagScript != USCRIPT_INVALID_CODE && subtagScript != USCRIPT_UNKNOWN;
        if (isUsableScript)
            return subtagScript;

        // Strip the trailing subtag and retry with the shorter identifier.
        remaining = remaining.substring(0, separatorPosition);
    }

    return USCRIPT_COMMON;
}

} // namespace WebCore