generateIntlCanonicalizeLanguage.py [plain text]

#!/usr/bin/env python

# Copyright (C) 2018 Andy VanWagoner (andy@vanwagoner.family)
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1.  Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
# 2.  Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# This tool processes the IANA file language-subtag-registry.txt to create
# information required to canonicalize language tags according to ECMA 402 and
# RFC 5646 Section 4.5.
# https://www.iana.org/assignments/language-subtag-registry
# https://tc39.github.io/ecma402/#sec-canonicalizelanguagetag
# https://tools.ietf.org/html/rfc5646#section-4.5

import sys
import optparse
import os

# Banner written at the very top of the generated file. It names this script
# (via __file__) so readers can tell where the file came from.
header = "// DO NOT EDIT! - This file was generated by " + __file__ + "\n"


# Trailer written after the generated content: a single newline.
footer = "\n"


def openOrExit(path, mode):
    """Open ``path`` with ``mode``, or report the failure and exit.

    On Python 3 the file is opened with an explicit UTF-8 encoding; on
    Python 2 the built-in ``open`` is used without an encoding argument.

    Returns the open file object. If opening raises IOError, a diagnostic
    is written to stderr and the process exits with status 1.
    """
    try:
        if sys.version_info.major >= 3:
            return open(path, mode, encoding="UTF-8")
        return open(path, mode)
    except IOError as e:
        # Diagnostics belong on stderr so they never mix with regular output.
        sys.stderr.write("I/O error opening {0}, ({1}): {2}\n".format(path, e.errno, e.strerror))
        # sys.exit() rather than the site-provided exit(): the latter is meant
        # for interactive sessions and is not defined when running with -S.
        sys.exit(1)


class SubtagRegistry:
    """Replacement tables extracted from a language subtag registry file.

    parse() reads the registry text record by record, process() files each
    record into one of the maps below, and dump() writes every map out as a
    C++ lookup function.
    """

    def __init__(self):
        # "language" records: Subtag -> Preferred-Value (only when present).
        self.languageMap = {}
        # "extlang" records: Subtag -> Prefix.
        self.extlangMap = {}
        # "region" records: Subtag -> Preferred-Value (only when present).
        self.regionMap = {}
        # "redundant" records (Tag -> Preferred-Value) and "variant" records
        # ("Prefix-Subtag" -> Preferred-Value).
        self.redundantMap = {}
        # "grandfathered" records: lower-cased Tag -> Preferred-Value, or the
        # lower-cased Tag itself when the record has no Preferred-Value.
        self.grandfatheredMap = {}
        # Value of the File-Date field; stays None when the input has none, so
        # dump() can test it without raising AttributeError.
        self.fileDate = None

    def parse(self, file):
        """Read registry records from ``file`` and process each of them.

        ``file`` is any iterable of text lines. Records are separated by
        lines consisting of "%%"; lines whose stripped text starts with "#"
        and blank lines are ignored. A line that begins with whitespace, or
        that contains no ":", is treated as the continuation of the previous
        field and appended to its value after a single space.
        """
        record = {}
        key = None
        for rawLine in file:
            line = rawLine.strip()
            if not line or line.startswith("#"):
                continue

            if line == "%%":
                self.process(record)
                record = {}
                key = None
                continue

            # Decide on the unstripped line: a folded line may itself contain
            # a ":" and must not be mistaken for a new field.
            isContinuation = rawLine[:1] in (" ", "\t") or ":" not in line
            if isContinuation:
                # A continuation with no preceding field has nothing to extend.
                if key is not None:
                    record[key] = record[key] + " " + line
                continue

            name, value = line.split(":", 1)
            key = name.strip()
            record[key] = value.strip()
        # The last record is not followed by a "%%" separator.
        self.process(record)

    def process(self, record):
        """File one parsed ``record`` (a dict of field name -> value).

        A "File-Date" field is remembered in ``self.fileDate``. Records
        without a "Type" field are otherwise ignored. Raises KeyError when a
        record lacks a field its type requires here (e.g. "Subtag", "Tag" or
        "Prefix").
        """
        if "File-Date" in record:
            self.fileDate = record["File-Date"]

        if "Type" not in record:
            return

        recordType = record["Type"]
        preferred = record.get("Preferred-Value")
        if recordType == "language" and preferred:
            self.languageMap[record["Subtag"]] = preferred
        elif recordType == "extlang":
            self.extlangMap[record["Subtag"]] = record["Prefix"]
        elif recordType == "region" and preferred:
            self.regionMap[record["Subtag"]] = preferred
        elif recordType == "redundant" and preferred:
            # Skip tags of the form "<prefix>-<extlang>" whose preferred value
            # is that extlang. This only sees extlang records processed
            # earlier, so it relies on them preceding the redundant records.
            lang = self.extlangMap.get(preferred)
            if "{}-{}".format(lang, preferred) != record["Tag"]:
                self.redundantMap[record["Tag"]] = preferred
        elif recordType == "variant" and preferred:
            key = "{}-{}".format(record['Prefix'], record['Subtag'])
            # NOTE(review): hard-coded special case; presumably expands the
            # bare variant "alalc97" to the full tag that should replace the
            # prefixed key -- confirm against the registry.
            if preferred == "alalc97":
                preferred = "ja-Latn-alalc97"
            self.redundantMap[key] = preferred
        elif recordType == "grandfathered":
            key = record["Tag"].lower()
            value = record.get("Preferred-Value", key)
            self.grandfatheredMap[key] = value

    def dump(self, file):
        """Write the C++ header body (pragma, guard, namespace, lookups) to ``file``."""
        if self.fileDate:
            file.write("// language-subtag-registry file date: {}\n".format(self.fileDate))
        file.write("\n#pragma once\n")
        file.write("\n#if ENABLE(INTL)\n")
        file.write("\nnamespace JSC {\n")
        self.dumpLookup(file, "intlPreferredLanguageTag", self.languageMap)
        self.dumpLookup(file, "intlPreferredExtlangTag", self.extlangMap)
        self.dumpLookup(file, "intlPreferredRegionTag", self.regionMap)
        self.dumpLookup(file, "intlRedundantLanguageTag", self.redundantMap)
        self.dumpLookup(file, "intlGrandfatheredLanguageTag", self.grandfatheredMap)
        file.write("\n} // namespace JSC\n")
        file.write("\n#endif // ENABLE(INTL)\n")

    def dumpLookup(self, file, name, map):
        """Write one ``static String name(const String& tag)`` function for ``map``."""
        file.write("\nstatic String {}(const String& tag)\n{{\n".format(name))
        file.write("    // {} possible replacements\n".format(len(map)))
        # We could pick the lookup implementation per map if desired
        # Anecdotal perf: if > switch > hash (slowest)
        # Code complexity: switch > if > hash (least complex)
        # Algo complexity: if = O(N) > switch = O(log N) > hash = O(1) (least complex)
        self.dumpIfLookup(file, name, map)
        file.write("}\n")

    def dumpHashLookup(self, file, name, map):
        """Write a function body that looks ``tag`` up in a lazily filled HashMap.

        ``name`` is accepted for symmetry with the other dump*Lookup methods
        and is not used.
        """
        file.write("    static NeverDestroyed<HashMap<String, String>> cache;\n")
        file.write("    HashMap<String, String>& map = cache.get();\n")
        file.write("    if (UNLIKELY(map.isEmpty())) {\n")
        # Sorted so the generated file does not depend on dict ordering.
        entries = sorted("        map.add(\"{}\"_s, \"{}\"_s);\n".format(k, v) for k, v in map.items())
        file.write("".join(entries))
        file.write("    }\n")
        file.write("    return map.get(tag);\n")

    def dumpIfLookup(self, file, name, map):
        """Write a function body that compares ``tag`` against every key in turn.

        ``name`` is accepted for symmetry with the other dump*Lookup methods
        and is not used.
        """
        # Sorted so the generated file does not depend on dict ordering.
        entries = sorted("    if (tag == \"{}\")\n        return \"{}\"_s;".format(k, v) for k, v in map.items())
        file.write("\n".join(entries))
        file.write("\n    return String();\n")

    def dumpSwitchLookup(self, file, name, map):
        """Write a function body that walks ``tag`` with nested switch statements.

        The keys of ``map`` are first arranged into a character trie: each
        node is a dict keyed by the next character, and the replacement for a
        complete key is stored under the multi-character key "value", which
        cannot collide with a single character. ``name`` is not used.
        """
        tree = {}
        for k, v in map.items():
            node = tree
            for char in k:
                node = node.setdefault(char, {})
            node["value"] = v
        self.dumpSwitchLookupTree(file, tree, 0)
        file.write("\n    return String();\n")

    def dumpSwitchLookupTree(self, file, tree, level):
        """Write the switch statement for one trie node.

        ``tree`` is a node built by dumpSwitchLookup and ``level`` is both
        the index of the character being switched on and the nesting depth
        used for indentation.
        """
        indent = " " * ((level + 1) * 4)
        if "value" in tree:
            # A key ends at this node: it matches only if the tag ends here too.
            file.write(indent + "if (tag.length() == {})\n".format(level))
            file.write(indent + "    return \"{}\"_s;\n".format(tree["value"]))
        # sorted() rather than keys().sort(): on Python 3 keys() returns a
        # view object that has no sort() method.
        keys = sorted(char for char in tree if char != "value")
        if not keys:
            return
        file.write(indent + "switch (tag[{}]) {{\n".format(level))
        for key in keys:
            file.write(indent + "case {}:\n".format(ord(key)))
            self.dumpSwitchLookupTree(file, tree[key], level + 1)
            file.write(indent + "    break;\n")
        file.write(indent + "default: break;\n")
        file.write(indent + "}\n")


# Script entry point: read the registry named by the first argument and write
# the generated header to the path named by the second.
if __name__ == "__main__":
    parser = optparse.OptionParser(usage="usage: %prog <language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("<language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>")

    registryPath, intlCanonHPath = args

    # The with statement closes (and therefore flushes) both files even when
    # parsing or dumping raises; the input is opened before the output, so a
    # missing registry never creates or truncates the header.
    with openOrExit(registryPath, "r") as registryFile, openOrExit(intlCanonHPath, "w") as intlCanonHFile:
        intlCanonHFile.write(header)

        registry = SubtagRegistry()
        registry.parse(registryFile)
        registry.dump(intlCanonHFile)

        intlCanonHFile.write(footer)

    # sys.exit() rather than the site-provided exit(), which is intended for
    # interactive use and is not defined when running with -S.
    sys.exit(0)