#! /usr/bin/env python

# Copyright (C) 2016 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# This tool processes the Unicode Character Database file CaseFolding.txt to create
# canonicalization table as described in ECMAScript 6 standard in section
# "21.2.2.8.2 Runtime Semantics: Canonicalize()", step 2.

import optparse
import os
import re
import sys

# Text written verbatim at the top of the generated C++ file.
header = """/*
 * Copyright (C) 2016 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// DO NO EDIT! - This file was generated by generateYarrCanonicalizeUnicode

#include "config.h"
#include "YarrCanonicalize.h"

namespace JSC { namespace Yarr {

"""

# Text written verbatim at the bottom of the generated C++ file.
footer = """} } // JSC::Yarr
"""

# Largest valid Unicode code point.
MaxUnicode = 0x10ffff

# Matches the "common" (C) and "simple" (S) lines of CaseFolding.txt:
#     <code>; <status>; <mapping>; ...
# Lines with any other status letter do not match and are skipped by the reader.
commonAndSimpleLinesRE = re.compile(
    r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE)


def openOrExit(path, mode):
    """Open `path` with `mode`, or print a diagnostic and exit with status 1.

    When the file is opened for writing, missing parent directories are created
    first. On Python 3 the file is opened as UTF-8 text.

    Returns the open file object.
    """
    try:
        dirname = os.path.dirname(path)
        opensForWriting = any(flag in mode for flag in "wax+")
        # dirname is "" for a bare file name; os.makedirs("") would fail, and
        # there is nothing to create in that case anyway.
        if dirname and opensForWriting and not os.path.isdir(dirname):
            os.makedirs(dirname)
        if sys.version_info.major >= 3:
            return open(path, mode, encoding="UTF-8")
        return open(path, mode)
    except (IOError, OSError) as e:
        # On Python 2 os.makedirs raises OSError, which is not an IOError there.
        print("I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror))
        sys.exit(1)


class Canonicalize:
    """Builds canonicalization tables from CaseFolding.txt mappings.

    `canonicalGroups` maps a case-folded code point to the list of code points
    that fold to it, in the order they were added.
    """

    def __init__(self, maxCodePoint=MaxUnicode):
        """Create an empty table covering code points 0..maxCodePoint inclusive."""
        self.maxCodePoint = maxCodePoint
        self.canonicalGroups = {}

    def addMapping(self, code, mapping):
        """Record that `code` case-folds to `mapping`."""
        self.canonicalGroups.setdefault(mapping, []).append(code)

    def readCaseFolding(self, file):
        """Read C/S mappings from an iterable of CaseFolding.txt lines.

        Text after '#' is ignored, as are blank lines and lines that do not
        match commonAndSimpleLinesRE. Every code point in 0..maxCodePoint that
        has no explicit mapping is then recorded as mapping to itself.

        Raises ValueError if a matched line names a code point above
        maxCodePoint (such a value could not be indexed when building tables).
        """
        codesSeen = set()
        for line in file:
            line = line.split('#', 1)[0].rstrip()
            if not line:
                continue

            fields = commonAndSimpleLinesRE.match(line)
            if not fields:
                continue

            code = int(fields.group('code'), 16)
            mapping = int(fields.group('mapping'), 16)
            if code > self.maxCodePoint or mapping > self.maxCodePoint:
                raise ValueError(
                    "code point out of range (max 0x{0:04x}): {1}".format(self.maxCodePoint, line))

            codesSeen.add(code)
            self.addMapping(code, mapping)

        # A mapping target with no line of its own joins its group here.
        for codePoint in range(self.maxCodePoint + 1):
            if codePoint not in codesSeen:
                self.addMapping(codePoint, codePoint)

    def _classifyCodePoints(self):
        """Return (typeInfo, characterSets) derived from canonicalGroups.

        typeInfo[cp] is a (typeName, value) pair; characterSets lists the
        sorted members of every group that has more than two code points.
        """
        # A code point that shares its group with nobody canonicalizes uniquely.
        typeInfo = [("CanonicalizeUnique", 0)] * (self.maxCodePoint + 1)
        characterSets = []

        # Sorted iteration keeps the numbering of the emitted sets stable.
        for mapping in sorted(self.canonicalGroups):
            characters = sorted(self.canonicalGroups[mapping])
            if len(characters) == 1:
                continue

            if len(characters) > 2:
                setEntry = ("CanonicalizeSet", len(characterSets))
                for ch in characters:
                    typeInfo[ch] = setEntry
                characterSets.append(characters)
                continue

            low, high = characters
            delta = high - low
            if delta == 1:
                # Adjacent pair: the type records whether the low member is odd or even.
                name = "CanonicalizeAlternatingUnaligned" if low & 1 else "CanonicalizeAlternatingAligned"
                typeInfo[low] = typeInfo[high] = (name, 0)
            else:
                typeInfo[low] = ("CanonicalizeRangeLo", delta)
                typeInfo[high] = ("CanonicalizeRangeHi", delta)

        return typeInfo, characterSets

    def _coalesceRanges(self, typeInfo):
        """Merge consecutive code points with equal type info into (begin, end, info) runs."""
        ranges = []
        begin = 0
        for codePoint in range(1, len(typeInfo) + 1):
            if codePoint == len(typeInfo) or typeInfo[codePoint] != typeInfo[begin]:
                ranges.append((begin, codePoint - 1, typeInfo[begin]))
                begin = codePoint
        return ranges

    def createTables(self, file):
        """Write the C++ character-set and range tables to `file`.

        `file` only needs a write() method accepting text.
        """
        typeInfo, characterSets = self._classifyCodePoints()
        rangeInfo = self._coalesceRanges(typeInfo)

        for index, members in enumerate(characterSets):
            characters = "".join("0x{character:04x}, ".format(character=ch) for ch in members)
            file.write("const UChar32 unicodeCharacterSet{index:d}[] = {{ {characters}0 }};\n".format(index=index, characters=characters))

        file.write("\n")
        file.write("static const size_t UNICODE_CANONICALIZATION_SETS = {setCount:d};\n".format(setCount=len(characterSets)))
        file.write("const UChar32* const unicodeCharacterSetInfo[UNICODE_CANONICALIZATION_SETS] = {\n")

        for index in range(len(characterSets)):
            file.write(" unicodeCharacterSet{setNumber:d},\n".format(setNumber=index))

        file.write("};\n")
        file.write("\n")
        file.write("const size_t UNICODE_CANONICALIZATION_RANGES = {rangeCount:d};\n".format(rangeCount=len(rangeInfo)))
        file.write("const CanonicalizationRange unicodeRangeInfo[UNICODE_CANONICALIZATION_RANGES] = {\n")

        for begin, end, (typeName, value) in rangeInfo:
            file.write(" {{ 0x{begin:04x}, 0x{end:04x}, 0x{value:04x}, {type} }},\n".format(begin=begin, end=end, value=value, type=typeName))

        file.write("};\n")
        file.write("\n")


def main(argv=None):
    """Command-line entry point: <CaseFolding.txt> <output-file>.

    `argv` defaults to sys.argv[1:]. Returns the process exit status.
    """
    parser = optparse.OptionParser(usage="usage: %prog <CaseFolding.txt> <output-file>")
    (options, args) = parser.parse_args(argv)

    if len(args) != 2:
        parser.error("expected two arguments: <CaseFolding.txt> <output-file>")

    caseFoldingTxtPath = args[0]
    canonicalizeHPath = args[1]

    canonicalize = Canonicalize()

    # Parse the input completely before touching the output file, so a bad
    # input does not leave a truncated output behind.
    with openOrExit(caseFoldingTxtPath, "r") as caseFoldingTxtFile:
        canonicalize.readCaseFolding(caseFoldingTxtFile)

    with openOrExit(canonicalizeHPath, "w") as canonicalizeHFile:
        canonicalizeHFile.write(header)
        canonicalize.createTables(canonicalizeHFile)
        canonicalizeHFile.write(footer)

    return 0


if __name__ == "__main__":
    sys.exit(main())