#!/usr/bin/env python

# Copyright (C) 2018 Andy VanWagoner (andy@vanwagoner.family)
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# This tool processes the IANA file language-subtag-registry.txt to create
# information required to canonicalize language tags according to ECMA 402 and
# RFC 5646 Section 4.5.
# https://www.iana.org/assignments/language-subtag-registry
# https://tc39.github.io/ecma402/#sec-canonicalizelanguagetag
# https://tools.ietf.org/html/rfc5646#section-4.5

import sys
import optparse
import os

# Text written before and after the generated lookup functions.
header = """// DO NO EDIT! - This file was generated by """ + __file__ + """
"""

footer = """
"""


def openOrExit(path, mode):
    """Open `path` with `mode`; on IOError print a diagnostic and exit(1)."""
    try:
        return open(path, mode)
    except IOError as e:
        print("I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror))
        # sys.exit rather than the `exit` helper, which only exists when the
        # `site` module has been loaded.
        sys.exit(1)


class SubtagRegistry:
    """Replacement tables extracted from the language subtag registry.

    `parse` fills the maps from the registry text; `dump` writes them out as
    C++ lookup functions, each returning the replacement for a tag or a null
    String when there is none.
    """

    def __init__(self):
        # Set from the "File-Date" record; stays None when the input has none,
        # so dump() can test it without raising AttributeError.
        self.fileDate = None
        self.languageMap = {}
        self.extlangMap = {}
        self.regionMap = {}
        self.redundantMap = {}
        self.grandfatheredMap = {}

    def parse(self, file):
        """Read records from an iterable of lines and process each one.

        Records are separated by "%%" lines. A field is "Name: value"; a line
        that starts with whitespace (or that has no colon) continues the
        value of the preceding field and is appended after a single space.
        Blank lines and lines starting with "#" are ignored.
        """
        record = {}
        key = None  # name of the field a continuation line would extend
        for rawLine in file:
            line = rawLine.strip()
            if not line or line.startswith("#"):
                continue
            if line == "%%":
                self.process(record)
                record = {}
                key = None
                continue
            # Leading whitespace marks a folded line even if the text happens
            # to contain a colon.
            isFolded = rawLine[:1] in (" ", "\t")
            if ":" in line and not isFolded:
                name, value = line.split(":", 1)
                key = name.strip()
                record[key] = value.strip()
            elif key is not None:
                # Description often continues on the next line
                record[key] += " " + line
        self.process(record)

    def process(self, record):
        """Store the mapping described by one parsed record, if any.

        `record` is a dict of field name to value. Records without a "Type"
        field only contribute "File-Date" (when present).
        """
        if "File-Date" in record:
            self.fileDate = record["File-Date"]
        if "Type" not in record:
            return

        recordType = record["Type"]
        preferred = record.get("Preferred-Value")

        if recordType == "language" and preferred:
            self.languageMap[record["Subtag"]] = preferred
        elif recordType == "extlang":
            self.extlangMap[record["Subtag"]] = record["Prefix"]
        elif recordType == "region" and preferred:
            self.regionMap[record["Subtag"]] = preferred
        elif recordType == "redundant" and preferred:
            # Skip tags of the form "<extlang prefix>-<preferred>"; only the
            # remaining ones get an entry in the redundant map.
            lang = self.extlangMap.get(preferred)
            if "{}-{}".format(lang, preferred) != record["Tag"]:
                self.redundantMap[record["Tag"]] = preferred
        elif recordType == "variant" and preferred:
            key = "{}-{}".format(record['Prefix'], record['Subtag'])
            if preferred == "alalc97":
                preferred = "ja-Latn-alalc97"
            self.redundantMap[key] = preferred
        elif recordType == "grandfathered":
            # Grandfathered tags without a preferred value map to themselves
            # (lower-cased).
            key = record["Tag"].lower()
            value = record.get("Preferred-Value", key)
            self.grandfatheredMap[key] = value

    def dump(self, file):
        """Write the generated declarations for every map to `file`."""
        if self.fileDate:
            file.write("// language-subtag-registry file date: {}\n".format(self.fileDate))
        file.write("\n#pragma once\n")
        file.write("\n#if ENABLE(INTL)\n")
        file.write("\nnamespace JSC {\n")

        self.dumpLookup(file, "intlPreferredLanguageTag", self.languageMap)
        self.dumpLookup(file, "intlPreferredExtlangTag", self.extlangMap)
        self.dumpLookup(file, "intlPreferredRegionTag", self.regionMap)
        self.dumpLookup(file, "intlRedundantLanguageTag", self.redundantMap)
        self.dumpLookup(file, "intlGrandfatheredLanguageTag", self.grandfatheredMap)

        file.write("\n} // namespace JSC\n")
        file.write("\n#endif // ENABLE(INTL)\n")

    def dumpLookup(self, file, name, map):
        """Write one `static String name(const String& tag)` function."""
        file.write("\nstatic String {}(const String& tag)\n{{\n".format(name))
        file.write(" // {} possible replacements\n".format(len(map)))
        # We could pick the lookup implementation per map if desired
        # Anecdotal perf: if > switch > hash (slowest)
        # Code complexity: switch > if > hash (least complex)
        # Algo complexity: if = O(N) > switch > O(log N) > hash = O(1) (least complex)
        self.dumpIfLookup(file, name, map)
        file.write("}\n")

    def dumpHashLookup(self, file, name, map):
        """Write a function body that looks the tag up in a lazily filled HashMap."""
        # The template arguments are required for the emitted C++ to compile.
        file.write(" static NeverDestroyed<HashMap<String, String>> cache;\n")
        file.write(" HashMap<String, String>& map = cache.get();\n")
        file.write(" if (UNLIKELY(map.isEmpty())) {\n")
        entries = sorted(" map.add(\"{}\"_s, \"{}\"_s);\n".format(k, v) for k, v in map.items())
        file.write("".join(entries))
        file.write(" }\n")
        file.write(" return map.get(tag);\n")

    def dumpIfLookup(self, file, name, map):
        """Write a function body made of one `if (tag == ...)` test per entry."""
        entries = sorted(" if (tag == \"{}\")\n return \"{}\"_s;".format(k, v) for k, v in map.items())
        file.write("\n".join(entries))
        file.write("\n return String();\n")

    def dumpSwitchLookup(self, file, name, map):
        """Write a function body that switches on the tag one character at a time."""
        # Build a trie keyed by character; the "value" key marks the end of a
        # tag and cannot collide with the single-character keys.
        tree = {}
        for k, v in map.items():
            node = tree
            for char in k:
                node = node.setdefault(char, {})
            node["value"] = v
        self.dumpSwitchLookupTree(file, tree, 0)
        file.write("\n return String();\n")

    def dumpSwitchLookupTree(self, file, tree, level):
        """Recursively write the switch statement for one trie node.

        `level` is both the nesting depth and the index of the tag character
        being tested.
        """
        indent = " " * ((level + 1) * 4)
        if "value" in tree:
            file.write(indent + "if (tag.length() == {})\n".format(level))
            file.write(indent + " return \"{}\"_s;\n".format(tree["value"]))
        # sorted() works on Python 2 and 3; dict views have no .sort() on 3.
        keys = sorted(k for k in tree if k != "value")
        if not keys:
            return
        file.write(indent + "switch (tag[{}]) {{\n".format(level))
        for key in keys:
            file.write(indent + "case {}:\n".format(ord(key)))
            self.dumpSwitchLookupTree(file, tree[key], level + 1)
            file.write(indent + " break;\n")
        file.write(indent + "default: break;\n")
        file.write(indent + "}\n")


def main():
    """Parse the registry named on the command line and write the header."""
    argNames = "<language-subtag-registry.txt> <IntlCanonicalizeLanguage.h>"
    parser = optparse.OptionParser(usage="usage: %prog " + argNames)
    (options, args) = parser.parse_args()
    if len(args) != 2:
        parser.error(argNames)
    registryPath, intlCanonHPath = args

    # Parse before opening the output so a failure while reading the registry
    # does not leave a truncated header behind.
    registry = SubtagRegistry()
    with openOrExit(registryPath, "r") as registryFile:
        registry.parse(registryFile)

    # Text mode: everything written is str, which a binary-mode file rejects
    # on Python 3.
    with openOrExit(intlCanonHPath, "w") as intlCanonHFile:
        intlCanonHFile.write(header)
        registry.dump(intlCanonHFile)
        intlCanonHFile.write(footer)
    return 0


if __name__ == "__main__":
    sys.exit(main())