#!/usr/bin/env python

# Copyright (C) 2017 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# This tool reads Unicode Character Database (UCD) files from a directory
# (UnicodeData.txt, Scripts.txt, ScriptExtensions.txt, PropList.txt, the
# Derived*.txt files, emoji-data.txt and the two alias files) and writes a C++
# source file containing:
#   - one character-class creation function per General_Category value,
#     supported binary property, Script value and Script_Extensions value, and
#   - hash tables mapping property names and their aliases to the index of the
#     matching creation function.
#
# Usage: <this script> <UCD-Directory> <output-file>

import sys
import copy
import optparse
import os
import re
from hasher import stringHash

# Text emitted at the top of the generated file.
# NOTE(review): the line breaks inside this template were reconstructed from
# the usual BSD license layout -- confirm against the checked-in output.
header = """/*
 * Copyright (C) 2017-2018 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// DO NO EDIT! - This file was generated by """ + __file__ + """
"""

# Text emitted at the end of the generated file.
footer = """
"""

# Every UCD file the __main__ section opens must be listed here so that
# verifyUCDFilesExist() can report all missing files in one pass.
RequiredUCDFiles = [
    "DerivedBinaryProperties.txt",
    "DerivedCoreProperties.txt",
    "DerivedNormalizationProps.txt",
    "PropList.txt",
    "PropertyAliases.txt",
    "PropertyValueAliases.txt",
    "ScriptExtensions.txt",
    "Scripts.txt",
    "UnicodeData.txt",
    "emoji-data.txt",
]

# Directory containing the UCD files; assigned by the __main__ section.
UCDDirectoryPath = None

# Binary properties for which tables are generated; entries of the property
# files naming any other property are skipped by BinaryProperty.
SupportedBinaryProperties = [
    "Alphabetic", "Any", "ASCII", "ASCII_Hex_Digit", "Assigned", "Bidi_Control", "Bidi_Mirrored", "Case_Ignorable",
    "Cased", "Changes_When_Casefolded", "Changes_When_Casemapped", "Changes_When_Lowercased", "Changes_When_NFKC_Casefolded",
    "Changes_When_Titlecased", "Changes_When_Uppercased", "Dash", "Default_Ignorable_Code_Point", "Deprecated",
    "Diacritic", "Emoji", "Emoji_Component", "Emoji_Modifier_Base", "Emoji_Modifier", "Emoji_Presentation",
    "Extended_Pictographic", "Extender", "Grapheme_Base", "Grapheme_Extend", "Hex_Digit", "ID_Continue", "ID_Start",
    "Ideographic", "IDS_Binary_Operator", "IDS_Trinary_Operator", "Join_Control", "Logical_Order_Exception", "Lowercase",
    "Math", "Noncharacter_Code_Point", "Pattern_Syntax", "Pattern_White_Space", "Quotation_Mark", "Radical",
    "Regional_Indicator", "Sentence_Terminal", "Soft_Dotted", "Terminal_Punctuation", "Unified_Ideograph", "Uppercase",
    "Variation_Selector", "White_Space", "XID_Continue", "XID_Start"]

# Code points up to lastASCIICodePoint are stored in PropertyData.matches /
# .ranges; everything from firstUnicodeCodePoint up goes to the unicode* lists.
lastASCIICodePoint = 0x7f
firstUnicodeCodePoint = 0x80
MaxUnicode = 0x10ffff
MaxBMP = 0xffff

# NOTE(review): this pattern is not referenced anywhere in this file; it looks
# like a leftover from a CaseFolding.txt based tool.  The named groups had lost
# their names ("(?P[0-9A-F]+)" does not compile), so "code" / "mapping" below
# are a reconstruction -- TODO confirm the names or delete the constant.
commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE)

# Shared Aliases instance; assigned by the __main__ section before any of the
# parsing classes below are instantiated.
aliases = None


def openOrExit(path, mode):
    """Open path with the given mode; on IOError print a message and exit(1)."""
    try:
        return open(path, mode)
    except IOError as e:
        print("I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror))
        sys.exit(1)


def openUCDFileOrExit(path):
    """Open the UCD file named path (relative to UCDDirectoryPath) for reading."""
    if not UCDDirectoryPath:
        print("UCD directory path has not been set")
        sys.exit(1)

    return openOrExit(os.path.join(UCDDirectoryPath, path), 'r')


def verifyUCDFilesExist():
    """Exit with status 1 if any file of RequiredUCDFiles is missing.

    Every missing file is reported before exiting.
    """
    if not UCDDirectoryPath:
        print("UCD directory path has not been set")
        sys.exit(1)

    missingFileCount = 0
    for fileName in RequiredUCDFiles:
        fullPath = os.path.join(UCDDirectoryPath, fileName)
        if not os.path.exists(fullPath):
            print("Couldn't find UCD file {0} at {1}".format(fileName, fullPath))
            missingFileCount = missingFileCount + 1

    if missingFileCount:
        sys.exit(1)


def ceilingToPowerOf2(size):
    """Return the smallest power of two that is >= size (1 for size <= 1)."""
    powerOf2 = 1

    while size > powerOf2:
        powerOf2 = powerOf2 << 1

    return powerOf2


def _ucdFields(file):
    """Yield the fields of every data line of a UCD style file.

    Text after the first '#' is dropped, lines that are then empty are
    skipped, and the remainder is split on ';' with each field stripped of
    surrounding whitespace.
    """
    for line in file:
        line = line.split('#', 1)[0].rstrip()
        if not line:
            continue
        yield [field.strip() for field in line.split(';')]


class Aliases:
    """Alias tables read from PropertyAliases.txt and PropertyValueAliases.txt."""

    def __init__(self):
        self.globalNameToAliases = {}
        self.generalCategoryToAliases = {}
        self.aliasToGeneralCategory = {}
        self.scriptToAliases = {}
        self.aliasToScript = {}

    def parsePropertyAliasesFile(self, file):
        """Record, per property long name (field 1), field 0 plus fields 2.. as its aliases."""
        for fields in _ucdFields(file):
            fullName = fields[1]
            names = [fields[0]] + fields[2:]

            if fullName in self.globalNameToAliases:
                print("Error, already an alias for {}".format(fullName))
            else:
                self.globalNameToAliases[fullName] = names

    def parsePropertyValueAliasesFile(self, file):
        """Record the aliases of General_Category ("gc") and Script ("sc") values.

        Lines for any other property are ignored.  For each value the long
        name (field 2) maps to [field 1] + fields 3.., and field 1 maps back
        to the long name.
        """
        for fields in _ucdFields(file):
            propertyType = fields[0]

            if propertyType == "gc":
                mapToModify = self.generalCategoryToAliases
                reverseMapToModify = self.aliasToGeneralCategory
            elif propertyType == "sc":
                mapToModify = self.scriptToAliases
                reverseMapToModify = self.aliasToScript
            else:
                continue

            primaryAlias = fields[1]
            fullName = fields[2]
            names = [primaryAlias] + fields[3:]

            if fullName in mapToModify:
                print("Error, already an {} alias for {}".format(propertyType, fullName))
            else:
                mapToModify[fullName] = names
                reverseMapToModify[primaryAlias] = fullName

    def globalAliasesFor(self, name):
        """Return the alias list of property name, or [] when unknown."""
        return self.globalNameToAliases.get(name, [])

    def generalCategoryAliasesFor(self, name):
        """Return the alias list of General_Category value name, or "" when unknown."""
        return self.generalCategoryToAliases.get(name, "")

    def generalCategoryForAlias(self, name):
        """Return the General_Category long name for alias name, or "" when unknown."""
        return self.aliasToGeneralCategory.get(name, "")

    def scriptAliasesFor(self, name):
        """Return the alias list of script name, or "" when unknown."""
        return self.scriptToAliases.get(name, "")

    def scriptNameForAlias(self, name):
        """Return the script long name for alias name, or "" when unknown."""
        return self.aliasToScript.get(name, "")


class PropertyData:
    """The set of code points having one property value.

    Code points <= lastASCIICodePoint are kept in matches (single code
    points) and ranges ((low, high) tuples); larger ones in unicodeMatches and
    unicodeRanges.  Every instance registers itself in allPropertyData, and
    its position there is the index used by the generated tables.
    """

    allPropertyData = []

    def __init__(self, name):
        self.name = name
        self.aliases = []
        self.index = len(PropertyData.allPropertyData)
        self.hasNonBMPCharacters = False
        self.matches = []
        self.ranges = []
        self.unicodeMatches = []
        self.unicodeRanges = []
        self.codePointCount = 0
        PropertyData.allPropertyData.append(self)

    def setAliases(self, aliases):
        self.aliases = aliases

    def makeCopy(self):
        """Return a deep copy registered in allPropertyData under a new index."""
        result = copy.deepcopy(self)
        result.index = len(PropertyData.allPropertyData)
        PropertyData.allPropertyData.append(result)
        return result

    def getIndex(self):
        return self.index

    def getCreateFuncName(self):
        return "createCharacterClass{}".format(self.index)

    def _addSpan(self, lowCodePoint, highCodePoint):
        """Add lowCodePoint..highCodePoint, as a match when it is a single code point."""
        if lowCodePoint == highCodePoint:
            self.addMatch(lowCodePoint)
        else:
            self.addRange(lowCodePoint, highCodePoint)

    def _addMatchOrdered(self, codePoint, matches, ranges):
        """Append codePoint to matches/ranges, merging with the last entry when adjacent.

        Falls back to the unordered insertion when codePoint is below the
        current last match or the end of the last range.
        """
        if (matches and matches[-1] > codePoint) or (ranges and ranges[-1][1] > codePoint):
            self.addMatchUnorderedForMatchesAndRanges(codePoint, matches, ranges)
            return

        self.codePointCount = self.codePointCount + 1
        if matches and matches[-1] == (codePoint - 1):
            # The previous match and this code point become a two element range.
            ranges.append((matches.pop(), codePoint))
        elif ranges and ranges[-1][1] == (codePoint - 1):
            # Extend the previous range by one.
            ranges.append((ranges.pop()[0], codePoint))
        else:
            matches.append(codePoint)

    def _addRangeOrdered(self, lowCodePoint, highCodePoint, matches, ranges):
        """Append lowCodePoint..highCodePoint to ranges, merging with an adjacent last entry.

        Falls back to the unordered insertion when lowCodePoint is below the
        current last match or the end of the last range.
        """
        if (matches and matches[-1] > lowCodePoint) or (ranges and ranges[-1][1] > lowCodePoint):
            self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, matches, ranges)
            return

        # Only the new code points are counted; an absorbed predecessor was
        # counted when it was added.
        self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
        if matches and matches[-1] == (lowCodePoint - 1):
            lowCodePoint = matches.pop()
        elif ranges and ranges[-1][1] == (lowCodePoint - 1):
            lowCodePoint = ranges.pop()[0]
        ranges.append((lowCodePoint, highCodePoint))

    def addMatch(self, codePoint):
        """Add a single code point."""
        if codePoint > MaxBMP:
            self.hasNonBMPCharacters = True
        if codePoint <= lastASCIICodePoint:
            self._addMatchOrdered(codePoint, self.matches, self.ranges)
        else:
            self._addMatchOrdered(codePoint, self.unicodeMatches, self.unicodeRanges)

    def addRange(self, lowCodePoint, highCodePoint):
        """Add the inclusive range lowCodePoint..highCodePoint.

        A range crossing the ASCII boundary is split at
        lastASCIICodePoint / firstUnicodeCodePoint and each half is added
        separately.
        """
        if highCodePoint > MaxBMP:
            self.hasNonBMPCharacters = True
        if highCodePoint <= lastASCIICodePoint:
            self._addRangeOrdered(lowCodePoint, highCodePoint, self.matches, self.ranges)
        elif lowCodePoint <= lastASCIICodePoint:
            if lowCodePoint == lastASCIICodePoint:
                self.addMatch(lowCodePoint)
            else:
                self.addRange(lowCodePoint, lastASCIICodePoint)
            if highCodePoint == firstUnicodeCodePoint:
                self.addMatch(highCodePoint)
            else:
                self.addRange(firstUnicodeCodePoint, highCodePoint)
        else:
            self._addRangeOrdered(lowCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges)

    def addMatchUnorderedForMatchesAndRanges(self, codePoint, matches, ranges):
        """Insert codePoint at its sorted position in matches/ranges.

        A match adjacent to codePoint is removed and folded into a range;
        ranges the result touches or overlaps are coalesced.
        codePointCount is adjusted for every entry removed or inserted.
        """
        if codePoint in matches:
            return
        insertLocation = None
        lowCodePoint = None
        highCodePoint = None
        for idx in range(len(matches)):
            match = matches[idx]
            if codePoint == match + 1:
                lowCodePoint = match
                if idx < (len(matches) - 1) and codePoint == matches[idx + 1] - 1:
                    # codePoint bridges two matches: absorb the upper one as well.
                    highCodePoint = matches[idx + 1]
                    del matches[idx + 1]
                    self.codePointCount = self.codePointCount - 1
                else:
                    highCodePoint = codePoint
                del matches[idx]
                self.codePointCount = self.codePointCount - 1
                break
            elif codePoint == match - 1:
                lowCodePoint = codePoint
                highCodePoint = match
                del matches[idx]
                self.codePointCount = self.codePointCount - 1
                break
            elif codePoint < match:
                insertLocation = idx
                break

        if insertLocation is None:
            insertLocation = len(matches)
        if lowCodePoint is None:
            lowCodePoint = codePoint
            highCodePoint = codePoint

        for idx in range(len(ranges)):
            span = ranges[idx]
            if lowCodePoint >= span[0] and highCodePoint <= span[1]:
                # Already covered by an existing range.
                return
            if lowCodePoint <= (span[1] + 1) and highCodePoint >= (span[0] - 1):
                # Swallow every range that touches or overlaps the new span.
                while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1):
                    span = ranges[idx]
                    lowCodePoint = min(lowCodePoint, span[0])
                    highCodePoint = max(highCodePoint, span[1])
                    del ranges[idx]
                    self.codePointCount = self.codePointCount - (span[1] - span[0]) - 1

                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return
            elif highCodePoint < span[0]:
                if lowCodePoint != highCodePoint:
                    ranges.insert(idx, (lowCodePoint, highCodePoint))
                    self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                    return
                break

        if lowCodePoint != highCodePoint:
            ranges.append((lowCodePoint, highCodePoint))
            self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
            return

        if insertLocation is not None:
            matches.insert(insertLocation, codePoint)
            self.codePointCount = self.codePointCount + 1

    def addRangeUnorderedForMatchesAndRanges(self, lowCodePoint, highCodePoint, matches, ranges):
        """Insert lowCodePoint..highCodePoint at its sorted position in ranges.

        Matches covered by or adjacent to the range are removed from matches,
        and ranges it touches or overlaps are coalesced.  codePointCount is
        adjusted for every entry removed or inserted.
        """
        if len(matches) and highCodePoint >= matches[0] and lowCodePoint <= matches[-1]:
            for idx in range(len(matches)):
                match = matches[idx]
                if lowCodePoint <= match and highCodePoint >= match:
                    while idx < len(matches) and highCodePoint >= matches[idx]:
                        del matches[idx]
                        self.codePointCount = self.codePointCount - 1
                    if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1:
                        highCodePoint = matches[idx + 1]
                        del matches[idx + 1]
                        self.codePointCount = self.codePointCount - 1
                    break
                elif lowCodePoint == match + 1:
                    lowCodePoint = match
                    while idx < len(matches) and highCodePoint >= matches[idx]:
                        del matches[idx]
                        self.codePointCount = self.codePointCount - 1

                    if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1:
                        highCodePoint = matches[idx + 1]
                        del matches[idx + 1]
                        self.codePointCount = self.codePointCount - 1
                    break
                elif highCodePoint == match - 1:
                    highCodePoint = match
                    del matches[idx]
                    self.codePointCount = self.codePointCount - 1
                    break
                elif highCodePoint < match:
                    break

        for idx in range(len(ranges)):
            span = ranges[idx]
            if lowCodePoint >= span[0] and highCodePoint <= span[1]:
                # Already covered by an existing range.
                return
            if lowCodePoint <= (span[1] + 1) and highCodePoint >= (span[0] - 1):
                # Swallow every range that touches or overlaps the new one.
                while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1):
                    span = ranges[idx]
                    lowCodePoint = min(lowCodePoint, span[0])
                    highCodePoint = max(highCodePoint, span[1])
                    del ranges[idx]
                    self.codePointCount = self.codePointCount - (span[1] - span[0]) - 1

                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return
            elif highCodePoint < span[0]:
                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return

        ranges.append((lowCodePoint, highCodePoint))
        self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1

    def addMatchUnordered(self, codePoint):
        """Add a code point that may be lower than ones added before."""
        if codePoint <= lastASCIICodePoint:
            self.addMatchUnorderedForMatchesAndRanges(codePoint, self.matches, self.ranges)
        else:
            self.addMatchUnorderedForMatchesAndRanges(codePoint, self.unicodeMatches, self.unicodeRanges)

    def addRangeUnordered(self, lowCodePoint, highCodePoint):
        """Add a range that may start below code points added before.

        A range crossing the ASCII boundary is split; the ASCII half goes to
        matches/ranges and the rest to unicodeMatches/unicodeRanges.
        """
        if highCodePoint <= lastASCIICodePoint:
            self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.matches, self.ranges)
        elif lowCodePoint >= firstUnicodeCodePoint:
            self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges)
        else:
            if lowCodePoint == lastASCIICodePoint:
                self.addMatchUnorderedForMatchesAndRanges(lowCodePoint, self.matches, self.ranges)
            else:
                # The ASCII half belongs to the ASCII lists; pairing
                # unicodeMatches with ranges here would mix the two tables.
                self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, lastASCIICodePoint, self.matches, self.ranges)
            if highCodePoint == firstUnicodeCodePoint:
                self.addMatchUnorderedForMatchesAndRanges(highCodePoint, self.unicodeMatches, self.unicodeRanges)
            else:
                self.addRangeUnorderedForMatchesAndRanges(firstUnicodeCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges)

    def removeMatchFromRanges(self, codePoint, ranges):
        """Remove codePoint from the first range of ranges containing it.

        The containing range is split around codePoint; a remainder of
        exactly one code point is re-added as a match.  Nothing happens when
        no range contains codePoint.
        """
        for idx in range(len(ranges)):
            low, high = ranges[idx]
            if not (low <= codePoint <= high):
                continue

            countBefore = self.codePointCount
            ranges.pop(idx)

            # Since list.insert inserts before the index given, handle the
            # high remainder first so that the low one ends up in front of it.
            remainders = []
            if codePoint < high:
                remainders.append((codePoint + 1, high))
            if low < codePoint:
                remainders.append((low, codePoint - 1))

            for remainder in remainders:
                if remainder[0] == remainder[1]:
                    self.addMatchUnordered(remainder[0])
                else:
                    ranges.insert(idx, remainder)

            # addMatchUnordered() counts a re-added remainder as if it were
            # new although the popped range already accounted for it; the net
            # effect of this call is exactly one code point fewer.
            self.codePointCount = countBefore - 1
            return

    def removeMatch(self, codePoint):
        """Remove codePoint from this property, if present."""
        if codePoint <= lastASCIICodePoint:
            matches, ranges = self.matches, self.ranges
        else:
            matches, ranges = self.unicodeMatches, self.unicodeRanges

        if codePoint in matches:
            matches.remove(codePoint)
            self.codePointCount = self.codePointCount - 1
        else:
            self.removeMatchFromRanges(codePoint, ranges)

    def dumpMatchData(self, file, valuesPerLine, dataList, formatter):
        """Write dataList as a brace-enclosed, comma-separated list.

        formatter(file, element) writes one element; a line break is written
        after every valuesPerLine elements.
        """
        valuesThisLine = 0
        firstValue = True

        file.write("{")
        for elem in dataList:
            if firstValue:
                firstValue = False
            else:
                file.write(", ")
            valuesThisLine = valuesThisLine + 1
            if valuesThisLine > valuesPerLine:
                file.write("\n ")
                valuesThisLine = 1
            formatter(file, elem)
        file.write("}")

    def dump(self, file, commaAfter):
        """Write the C++ function that creates this property's character class.

        commaAfter is accepted for the caller's benefit but not used.
        """
        # NOTE(review): the template arguments (<CharacterClass>, <UChar32>,
        # <CharacterRange>) were missing from these strings, which made the
        # generated C++ ill-formed; they are restored here to what the
        # consuming code is presumed to expect -- confirm against the
        # CharacterClass constructor.
        file.write("static std::unique_ptr<CharacterClass> {}()\n{{\n".format(self.getCreateFuncName()))
        file.write(" // Name = {}, number of codePoints: {}\n".format(self.name, self.codePointCount))
        file.write(" auto characterClass = std::make_unique<CharacterClass>(\n")
        file.write(" std::initializer_list<UChar32>(")
        self.dumpMatchData(file, 8, self.matches, lambda out, match: out.write("{0:0=#4x}".format(match)))
        file.write("),\n")
        file.write(" std::initializer_list<CharacterRange>(")
        self.dumpMatchData(file, 4, self.ranges, lambda out, span: out.write("{{{0:0=#4x}, {1:0=#4x}}}".format(span[0], span[1])))
        file.write("),\n")
        file.write(" std::initializer_list<UChar32>(")
        self.dumpMatchData(file, 8, self.unicodeMatches, lambda out, match: out.write("{0:0=#6x}".format(match)))
        file.write("),\n")
        file.write(" std::initializer_list<CharacterRange>(")
        self.dumpMatchData(file, 4, self.unicodeRanges, lambda out, span: out.write("{{{0:0=#6x}, {1:0=#6x}}}".format(span[0], span[1])))
        file.write("));\n")

        file.write(" characterClass->m_hasNonBMPCharacters = {};\n".format("true" if self.hasNonBMPCharacters else "false"))
        file.write(" return characterClass;\n}\n\n")

    @classmethod
    def dumpAll(cls, file):
        """Write every registered creation function and the table of pointers to them."""
        for propertyData in cls.allPropertyData:
            propertyData.dump(file, propertyData != cls.allPropertyData[-1])

        # NOTE(review): <CharacterClass> restored here as in dump() -- confirm.
        file.write("typedef std::unique_ptr<CharacterClass> (*CreateCharacterClass)();\n")
        file.write("static CreateCharacterClass createFunctions[{}] = {{\n ".format(len(cls.allPropertyData)))
        functionsOnThisLine = 0
        for propertyData in cls.allPropertyData:
            file.write(" {},".format(propertyData.getCreateFuncName()))
            functionsOnThisLine = functionsOnThisLine + 1
            if functionsOnThisLine == 4:
                file.write("\n ")
                functionsOnThisLine = 0

        file.write("};\n\n")

    @classmethod
    def createAndDumpHashTable(cls, file, propertyDict, tablePrefix):
        """Write a chained hash table mapping names and aliases to property indices.

        propertyDict maps a property name to its PropertyData.  The index
        table has ceilingToPowerOf2(2 * len(propertyDict)) primary slots;
        colliding keys are chained through slots appended after them.
        """
        hashSize = ceilingToPowerOf2(len(propertyDict) * 2)
        hashMask = hashSize - 1
        hashTable = [None] * hashSize
        valueTable = []
        tableSize = hashSize

        keyValuesToHash = []
        for propertyName in propertyDict:
            propertyData = propertyDict[propertyName]
            keyValuesToHash.append((propertyName, propertyData.getIndex()))
            for alias in propertyData.aliases:
                keyValuesToHash.append((alias, propertyData.getIndex()))

        for key, propertyIndex in keyValuesToHash:
            slot = stringHash(key) % hashSize
            while hashTable[slot] is not None:
                if hashTable[slot][1] is not None:
                    # Follow the existing collision chain.
                    slot = hashTable[slot][1]
                else:
                    # End of the chain: link a fresh overflow slot to it.
                    hashTable[slot] = (hashTable[slot][0], tableSize)
                    hashTable.append(None)
                    slot = tableSize
                    tableSize = tableSize + 1

            hashTable[slot] = (len(valueTable), None)
            valueTable.append((key, propertyIndex))

        file.write("static const struct HashIndex {}TableIndex[{}] = {{\n".format(tablePrefix, len(hashTable)))

        for tableIndex in hashTable:
            # -1 marks an empty slot and the end of a chain respectively.
            value = -1
            nextSlot = -1
            if tableIndex is not None:
                value = tableIndex[0]
                if tableIndex[1] is not None:
                    nextSlot = tableIndex[1]

            file.write(" {{ {}, {} }},\n".format(value, nextSlot))

        file.write("};\n\n")

        file.write("static const struct HashValue {}TableValue[{}] = {{\n".format(tablePrefix, len(valueTable)))
        for value in valueTable:
            file.write(" {{ \"{}\", {} }},\n".format(value[0], value[1]))
        file.write("};\n\n")

        file.write("static const struct HashTable {}HashTable = \n".format(tablePrefix))
        file.write(" {{ {}, {}, {}TableValue, {}TableIndex }};\n\n".format(len(valueTable), hashMask, tablePrefix, tablePrefix))


class Scripts:
    """Script and Script_Extensions data (Scripts.txt / ScriptExtensions.txt)."""

    def __init__(self):
        self.allPropertyData = []
        self.scriptsByName = {}
        self.scriptExtensionsByName = {}
        self.unknownScript = PropertyData("Unknown")
        self.unknownScript.setAliases(aliases.scriptAliasesFor("Unknown"))
        self.allPropertyData.append(self.unknownScript)
        self.scriptsParsed = False

    def parseScriptsFile(self, file):
        """Build one PropertyData per script and fill in the "Unknown" script."""
        currentScriptName = None
        currentPropertyData = None
        # To calculate Unknown, we gather all the code points assigned to a
        # script as ranges, sort them, and then walk the list to create the
        # inverse of the assigned ranges.
        assignedCodePointRanges = []

        for fields in _ucdFields(file):
            codePoints = fields[0]
            scriptName = fields[1]

            if scriptName != currentScriptName:
                currentScriptName = scriptName
                currentPropertyData = PropertyData(scriptName)
                currentPropertyData.setAliases(aliases.scriptAliasesFor(scriptName))
                self.allPropertyData.append(currentPropertyData)
                self.scriptsByName[scriptName] = currentPropertyData

            dotDot = codePoints.find("..")
            if dotDot == -1:
                codePoint = int(codePoints, 16)
                currentPropertyData.addMatch(codePoint)
                assignedCodePointRanges.append((codePoint, codePoint))
            else:
                lowCodePoint = int(codePoints[:dotDot], 16)
                highCodePoint = int(codePoints[dotDot + 2:], 16)
                currentPropertyData.addRange(lowCodePoint, highCodePoint)
                assignedCodePointRanges.append((lowCodePoint, highCodePoint))

        assignedCodePointRanges.sort(key=lambda span: span[0])
        lastAssignedCodePoint = 0

        for span in assignedCodePointRanges:
            if span[0] - lastAssignedCodePoint > 1:
                self.unknownScript._addSpan(lastAssignedCodePoint + 1, span[0] - 1)
            lastAssignedCodePoint = span[1]

        if lastAssignedCodePoint < MaxUnicode:
            self.unknownScript._addSpan(lastAssignedCodePoint + 1, MaxUnicode)

        self.scriptsParsed = True

    def _scriptExtensionBaseFor(self, scriptName):
        """Create and register the Script_Extensions data seeded from script scriptName.

        The data is a copy of the script's data when the script was parsed,
        otherwise a new, empty PropertyData.
        """
        if scriptName in self.scriptsByName:
            propertyData = self.scriptsByName[scriptName].makeCopy()
        else:
            propertyData = PropertyData(scriptName)
            propertyData.setAliases(aliases.scriptAliasesFor(scriptName))
        self.allPropertyData.append(propertyData)
        self.scriptExtensionsByName[scriptName] = propertyData
        return propertyData

    def parseScriptExtensionsFile(self, file):
        """Build the Script_Extensions data; requires parseScriptsFile() to have run.

        Each listed script's extension data starts as a copy of its script
        data and gains the listed code points, which are in turn removed from
        the Common and Inherited extension data.
        """
        if not self.scriptsParsed:
            print("Error: parsing ScriptExtensions.txt before Scripts.txt")
            sys.exit(1)

        commonScriptExtenstionPropertyData = self._scriptExtensionBaseFor("Common")
        inheritedScriptExtensionPropertyData = self._scriptExtensionBaseFor("Inherited")

        for fields in _ucdFields(file):
            codePoints = fields[0]
            scriptAliasList = fields[1]

            for scriptAlias in scriptAliasList.split(' '):
                scriptName = aliases.scriptNameForAlias(scriptAlias)
                if scriptName not in self.scriptExtensionsByName:
                    currentPropertyData = self.scriptsByName[scriptName].makeCopy()
                    self.allPropertyData.append(currentPropertyData)
                    self.scriptExtensionsByName[scriptName] = currentPropertyData
                else:
                    currentPropertyData = self.scriptExtensionsByName[scriptName]

                dotDot = codePoints.find("..")
                if dotDot == -1:
                    codePoint = int(codePoints, 16)
                    currentPropertyData.addMatch(codePoint)
                    commonScriptExtenstionPropertyData.removeMatch(codePoint)
                    inheritedScriptExtensionPropertyData.removeMatch(codePoint)
                else:
                    lowCodePoint = int(codePoints[:dotDot], 16)
                    highCodePoint = int(codePoints[dotDot + 2:], 16)
                    currentPropertyData.addRange(lowCodePoint, highCodePoint)
                    for codePoint in range(lowCodePoint, highCodePoint + 1):
                        commonScriptExtenstionPropertyData.removeMatch(codePoint)
                        inheritedScriptExtensionPropertyData.removeMatch(codePoint)

        # For the scripts that don't have any additional extension code
        # points, reuse the script data as the script extension data of the
        # same name.
        for scriptName, propertyData in self.scriptsByName.items():
            if scriptName not in self.scriptExtensionsByName:
                self.scriptExtensionsByName[scriptName] = propertyData

    def dump(self, file):
        """Write the Script and Script_Extensions hash tables."""
        file.write("// Scripts:\n")
        PropertyData.createAndDumpHashTable(file, self.scriptsByName, "script")
        file.write("// Script_Extensions:\n")
        PropertyData.createAndDumpHashTable(file, self.scriptExtensionsByName, "scriptExtension")


class GeneralCategory:
    """General_Category data read from a UnicodeData.txt style file."""

    def __init__(self, file):
        self.file = file
        self.allPropertyData = []
        self.propertyDataByCategory = {}
        self.createSpecialPropertyData("Any", (0, MaxUnicode))
        self.createSpecialPropertyData("ASCII", (0, lastASCIICodePoint))
        self.assignedPropertyData = self.createSpecialPropertyData("Assigned")
        self.unassignedProperyData = self.findPropertyGroupFor("Cn")[1]
        self.casedLetterPropertyData = self.findPropertyGroupFor("LC")[1]
        self.lastAddedCodePoint = 0

    def createSpecialPropertyData(self, name, range=None):
        """Create and register the PropertyData name, optionally seeded with a (low, high) range."""
        propertyData = PropertyData(name)
        self.allPropertyData.append(propertyData)
        self.propertyDataByCategory[name] = propertyData

        if range:
            propertyData.addRange(range[0], range[1])

        return propertyData

    def _propertyDataFor(self, category, categoryAliases):
        """Return the PropertyData registered for category, creating it on first use."""
        if category not in self.propertyDataByCategory:
            propertyData = PropertyData(category)
            propertyData.setAliases(categoryAliases)
            self.allPropertyData.append(propertyData)
            self.propertyDataByCategory[category] = propertyData
        return self.propertyDataByCategory[category]

    def findPropertyGroupFor(self, categoryAlias):
        """Return (groupPropertyData, propertyData) for a category alias such as "Lu".

        The group is the category named by the first letter of the alias.
        Either PropertyData is created on first use, the group first.
        """
        category = aliases.generalCategoryForAlias(categoryAlias)
        allCategoryAliases = aliases.generalCategoryAliasesFor(category)
        categoryGroup = aliases.generalCategoryForAlias(categoryAlias[0])
        allCategoryGroupAlias = aliases.generalCategoryAliasesFor(categoryGroup)

        groupPropertyData = self._propertyDataFor(categoryGroup, allCategoryGroupAlias)
        propertyData = self._propertyDataFor(category, allCategoryAliases)

        return (groupPropertyData, propertyData)

    def addNextCodePoints(self, categoryAlias, codePoint, highCodePoint=None):
        """Record codePoint (or codePoint..highCodePoint) as having category categoryAlias.

        Code points skipped since the previous call are recorded as
        unassigned ("Cn").
        """
        if codePoint - self.lastAddedCodePoint > 1:
            for propertyData in self.findPropertyGroupFor("Cn"):
                propertyData._addSpan(self.lastAddedCodePoint + 1, codePoint - 1)

        propertyDatas = self.findPropertyGroupFor(categoryAlias)
        isCasedLetter = categoryAlias in ("Ll", "Lt", "Lu")
        if highCodePoint:
            propertyDatas[0].addRange(codePoint, highCodePoint)
            propertyDatas[1].addRange(codePoint, highCodePoint)
            if isCasedLetter:
                self.casedLetterPropertyData.addRange(codePoint, highCodePoint)
            self.assignedPropertyData.addRange(codePoint, highCodePoint)

            self.lastAddedCodePoint = highCodePoint
        else:
            propertyDatas[0].addMatch(codePoint)
            propertyDatas[1].addMatch(codePoint)
            if isCasedLetter:
                self.casedLetterPropertyData.addMatch(codePoint)
            self.assignedPropertyData.addMatch(codePoint)

            self.lastAddedCodePoint = codePoint

    def parse(self):
        """Read self.file and populate the per-category PropertyData.

        Two consecutive lines whose descriptions end in "First>" and "Last>"
        are treated as one range.  Code points after the last entry up to
        MaxUnicode are recorded as unassigned ("Cn").
        """
        lastLineFirstOfRange = None
        for fields in _ucdFields(self.file):
            codePoint = int(fields[0], 16)
            description = fields[1]
            categoryAlias = fields[2]

            if lastLineFirstOfRange is not None:
                if description[-5:-1] == "Last":
                    self.addNextCodePoints(categoryAlias, lastLineFirstOfRange, codePoint)
                    lastLineFirstOfRange = None
                    continue
                print("Malformed First..Last pair of lines")
                # Forget the dangling "First" so that the remaining lines are
                # not all reported as malformed.
                lastLineFirstOfRange = None

            if description[-6:-1] == "First":
                lastLineFirstOfRange = codePoint
                continue

            self.addNextCodePoints(categoryAlias, codePoint)

        if self.lastAddedCodePoint < MaxUnicode:
            for propertyData in self.findPropertyGroupFor("Cn"):
                propertyData._addSpan(self.lastAddedCodePoint + 1, MaxUnicode)

    def dump(self, file):
        """Write the General_Category hash table."""
        file.write("// General_Category:\n")
        PropertyData.createAndDumpHashTable(file, self.propertyDataByCategory, "generalCategory")


class BinaryProperty:
    """Binary property data collected from one or more UCD property files."""

    def __init__(self):
        self.allPropertyData = []
        self.propertyDataByProperty = {}

    def parsePropertyFile(self, file):
        """Add the code points of every supported property listed in file.

        A new PropertyData is started whenever the property name changes
        from one data line to the next; properties not in
        SupportedBinaryProperties are skipped.
        """
        currentPropertyName = None
        currentPropertyData = None

        for fields in _ucdFields(file):
            codePoints = fields[0]
            propertyName = fields[1]

            if propertyName != currentPropertyName:
                if propertyName not in SupportedBinaryProperties:
                    continue

                currentPropertyName = propertyName
                currentPropertyData = PropertyData(propertyName)
                currentPropertyData.setAliases(aliases.globalAliasesFor(propertyName))
                self.allPropertyData.append(currentPropertyData)
                self.propertyDataByProperty[propertyName] = currentPropertyData

            dotDot = codePoints.find("..")
            if dotDot == -1:
                currentPropertyData.addMatch(int(codePoints, 16))
            else:
                currentPropertyData.addRange(int(codePoints[:dotDot], 16), int(codePoints[dotDot + 2:], 16))

    def dump(self, file):
        """Write the binary property hash table."""
        file.write("// binary properties:\n")
        PropertyData.createAndDumpHashTable(file, self.propertyDataByProperty, "binaryProperty")


if __name__ == "__main__":
    # NOTE(review): the argument placeholders had been lost from the usage and
    # error strings; the names below are a reconstruction -- confirm.
    parser = optparse.OptionParser(usage="usage: %prog <UCD-Directory> <YarrUnicodePropertyData.h>")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("<UCD-Directory> <YarrUnicodePropertyData.h>")

    UCDDirectoryPath = args[0]
    unicodePropertyDataHPath = args[1]

    verifyUCDFilesExist()

    propertyAliasesFile = openUCDFileOrExit("PropertyAliases.txt")
    propertyValueAliasesFile = openUCDFileOrExit("PropertyValueAliases.txt")
    scriptsFile = openUCDFileOrExit("Scripts.txt")
    scriptExtensionsFile = openUCDFileOrExit("ScriptExtensions.txt")
    unicodeDataFile = openUCDFileOrExit("UnicodeData.txt")
    derivedBinaryPropertiesFile = openUCDFileOrExit("DerivedBinaryProperties.txt")
    derivedCorePropertiesFile = openUCDFileOrExit("DerivedCoreProperties.txt")
    derivedNormalizationPropertiesFile = openUCDFileOrExit("DerivedNormalizationProps.txt")
    propListFile = openUCDFileOrExit("PropList.txt")
    emojiDataFile = openUCDFileOrExit("emoji-data.txt")

    inputFiles = [
        propertyAliasesFile, propertyValueAliasesFile, scriptsFile, scriptExtensionsFile, unicodeDataFile,
        derivedBinaryPropertiesFile, derivedCorePropertiesFile, derivedNormalizationPropertiesFile,
        propListFile, emojiDataFile,
    ]

    aliases = Aliases()

    # Text mode: everything written below is str.
    propertyDataHFile = openOrExit(unicodePropertyDataHPath, "w")

    try:
        propertyDataHFile.write(header)

        aliases.parsePropertyAliasesFile(propertyAliasesFile)
        aliases.parsePropertyValueAliasesFile(propertyValueAliasesFile)

        generalCategory = GeneralCategory(unicodeDataFile)
        generalCategory.parse()

        binaryProperty = BinaryProperty()
        binaryProperty.parsePropertyFile(derivedBinaryPropertiesFile)
        binaryProperty.parsePropertyFile(derivedCorePropertiesFile)
        binaryProperty.parsePropertyFile(derivedNormalizationPropertiesFile)
        binaryProperty.parsePropertyFile(propListFile)
        binaryProperty.parsePropertyFile(emojiDataFile)

        scripts = Scripts()
        scripts.parseScriptsFile(scriptsFile)
        scripts.parseScriptExtensionsFile(scriptExtensionsFile)

        PropertyData.dumpAll(propertyDataHFile)
        generalCategory.dump(propertyDataHFile)
        binaryProperty.dump(propertyDataHFile)
        scripts.dump(propertyDataHFile)

        propertyDataHFile.write(footer)
    finally:
        propertyDataHFile.close()
        for inputFile in inputFiles:
            inputFile.close()

    sys.exit(0)