generateYarrUnicodePropertyTables.py [plain text]

#!/usr/bin/env python

# Copyright (C) 2017 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1.  Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer.
# 2.  Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# This tool processes the Unicode Character Database file CaseFolding.txt to create
# canonicalization table as decribed in ECMAScript 6 standard in section
# "21.2.2.8.2 Runtime Semantics: Canonicalize()", step 2.

import sys
import copy
import optparse
import os
import re
from hasher import stringHash

header = """/*
* Copyright (C) 2017-2018 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1.  Redistributions of source code must retain the above copyright
*     notice, this list of conditions and the following disclaimer.
* 2.  Redistributions in binary form must reproduce the above copyright
*     notice, this list of conditions and the following disclaimer in the
*     documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// DO NO EDIT! - This file was generated by """ + __file__ + """
"""


footer = """
"""

RequiredUCDFiles = ["DerivedBinaryProperties.txt", "DerivedCoreProperties.txt", "DerivedNormalizationProps.txt", "PropList.txt", "PropertyAliases.txt", "PropertyValueAliases.txt", "ScriptExtensions.txt", "UnicodeData.txt", "emoji-data.txt"]
UCDDirectoryPath = None

SupportedBinaryProperties = [
    "Alphabetic", "Any", "ASCII", "ASCII_Hex_Digit", "Assigned", "Bidi_Control", "Bidi_Mirrored", "Case_Ignorable",
    "Cased", "Changes_When_Casefolded", "Changes_When_Casemapped", "Changes_When_Lowercased", "Changes_When_NFKC_Casefolded",
    "Changes_When_Titlecased", "Changes_When_Uppercased", "Dash", "Default_Ignorable_Code_Point", "Deprecated",
    "Diacritic", "Emoji", "Emoji_Component", "Emoji_Modifier_Base", "Emoji_Modifier", "Emoji_Presentation",
    "Extended_Pictographic", "Extender", "Grapheme_Base", "Grapheme_Extend", "Hex_Digit", "ID_Continue", "ID_Start",
    "Ideographic", "IDS_Binary_Operator", "IDS_Trinary_Operator", "Join_Control", "Logical_Order_Exception", "Lowercase",
    "Math", "Noncharacter_Code_Point", "Pattern_Syntax", "Pattern_White_Space", "Quotation_Mark", "Radical",
    "Regional_Indicator", "Sentence_Terminal", "Soft_Dotted", "Terminal_Punctuation", "Unified_Ideograph", "Uppercase",
    "Variation_Selector", "White_Space", "XID_Continue", "XID_Start"]

lastASCIICodePoint = 0x7f
firstUnicodeCodePoint = 0x80
MaxUnicode = 0x10ffff
MaxBMP = 0xffff
commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE)
aliases = None


def openOrExit(path, mode):
    try:
        return open(path, mode)
    except IOError as e:
        print("I/O error opening {0}, ({1}): {2}".format(path, e.errno, e.strerror))
        exit(1)


def openUCDFileOrExit(path):
    if not UCDDirectoryPath:
        exit(1)

    return openOrExit(os.path.join(UCDDirectoryPath, path), 'r')


def verifyUCDFilesExist():
    if not UCDDirectoryPath:
        exit(1)

    missingFileCount = 0
    for file in RequiredUCDFiles:
        fullPath = os.path.join(UCDDirectoryPath, file)
        if not os.path.exists(fullPath):
            print("Couldn't find UCD file {0} at {1}".format(file, fullPath))
            missingFileCount = missingFileCount + 1
    if missingFileCount:
        exit(1)


def ceilingToPowerOf2(size):
    powerOf2 = 1
    while size > powerOf2:
        powerOf2 = powerOf2 << 1

    return powerOf2


class Aliases:
    def __init__(self):
        self.globalNameToAliases = {}
        self.generalCategoryToAliases = {}
        self.aliasToGeneralCategory = {}
        self.scriptToAliases = {}
        self.aliasToScript = {}

    def parsePropertyAliasesFile(self, file):
        for line in file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            aliases = [fields[0].strip()]
            fullName = fields[1].strip()
            for otherAlias in fields[2:]:
                aliases.append(otherAlias.strip())

            if fullName in self.globalNameToAliases:
                print("Error, already an alias for {}".format(fullName))
            else:
                self.globalNameToAliases[fullName] = aliases

    def parsePropertyValueAliasesFile(self, file):
        for line in file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            propertyType = fields[0].strip()

            if propertyType == "gc":
                mapToModify = self.generalCategoryToAliases
                reverseMapToModify = self.aliasToGeneralCategory
            elif propertyType == "sc":
                mapToModify = self.scriptToAliases
                reverseMapToModify = self.aliasToScript
            else:
                continue

            primaryAlias = fields[1].strip()
            fullName = fields[2].strip()
            aliases = [primaryAlias]
            for otherAlias in fields[3:]:
                aliases.append(otherAlias.strip())

            if fullName in mapToModify:
                print("Error, already an {} alias for {}".format(propertyType, fullName))
            else:
                mapToModify[fullName] = aliases
                if reverseMapToModify != None:
                    reverseMapToModify[primaryAlias] = fullName

    def globalAliasesFor(self, name):
        if name not in self.globalNameToAliases:
            return []
        return self.globalNameToAliases[name]

    def generalCategoryAliasesFor(self, name):
        if name not in self.generalCategoryToAliases:
            return ""
        return self.generalCategoryToAliases[name]

    def generalCategoryForAlias(self, name):
        if name not in self.aliasToGeneralCategory:
            return ""
        return self.aliasToGeneralCategory[name]

    def scriptAliasesFor(self, name):
        if name not in self.scriptToAliases:
            return ""
        return self.scriptToAliases[name]

    def scriptNameForAlias(self, name):
        if name not in self.aliasToScript:
            return ""
        return self.aliasToScript[name]


class PropertyData:
    allPropertyData = []

    def __init__(self, name):
        self.name = name
        self.aliases = []
        self.index = len(PropertyData.allPropertyData)
        self.hasNonBMPCharacters = False
        self.matches = []
        self.ranges = []
        self.unicodeMatches = []
        self.unicodeRanges = []
        self.codePointCount = 0
        PropertyData.allPropertyData.append(self)

    def setAliases(self, aliases):
        self.aliases = aliases

    def makeCopy(self):
        result = copy.deepcopy(self)
        result.index = len(PropertyData.allPropertyData)
        PropertyData.allPropertyData.append(result)
        return result

    def getIndex(self):
        return self.index

    def getCreateFuncName(self):
        return "createCharacterClass{}".format(self.index)

    def addMatch(self, codePoint):
        if codePoint > MaxBMP:
            self.hasNonBMPCharacters = True
        if codePoint <= lastASCIICodePoint:
            if (len(self.matches) and self.matches[-1] > codePoint) or (len(self.ranges) and self.ranges[-1][1] > codePoint):
                self.addMatchUnordered(codePoint)
                return

            self.codePointCount = self.codePointCount + 1
            if len(self.matches) and self.matches[-1] == (codePoint - 1):
                lowCodePoint = self.matches.pop()
                self.ranges.append((lowCodePoint, codePoint))
            elif len(self.ranges) and self.ranges[-1][1] == (codePoint - 1):
                priorRange = self.ranges.pop()
                self.ranges.append((priorRange[0], codePoint))
            else:
                self.matches.append(codePoint)
        else:
            if (len(self.unicodeMatches) and self.unicodeMatches[-1] > codePoint) or (len(self.unicodeRanges) and self.unicodeRanges[-1][1] > codePoint):
                self.addMatchUnordered(codePoint)
                return

            self.codePointCount = self.codePointCount + 1
            if len(self.unicodeMatches) and self.unicodeMatches[-1] == (codePoint - 1):
                lowCodePoint = self.unicodeMatches.pop()
                self.unicodeRanges.append((lowCodePoint, codePoint))
            elif len(self.unicodeRanges) and self.unicodeRanges[-1][1] == (codePoint - 1):
                priorRange = self.unicodeRanges.pop()
                self.unicodeRanges.append((priorRange[0], codePoint))
            else:
                self.unicodeMatches.append(codePoint)

    def addRange(self, lowCodePoint, highCodePoint):
        if highCodePoint > MaxBMP:
            self.hasNonBMPCharacters = True
        if highCodePoint <= lastASCIICodePoint:
            if (len(self.matches) and self.matches[-1] > lowCodePoint) or (len(self.ranges) and self.ranges[-1][1] > lowCodePoint):
                self.addRangeUnordered(lowCodePoint, highCodePoint)
                return

            self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
            if len(self.matches) and self.matches[-1] == (lowCodePoint - 1):
                lowCodePoint = self.matches.pop()
            elif len(self.ranges) and self.ranges[-1][1] == (lowCodePoint - 1):
                priorRange = self.ranges.pop()
                lowCodePoint = priorRange[0]
            self.ranges.append((lowCodePoint, highCodePoint))
        elif lowCodePoint <= lastASCIICodePoint:
            if lowCodePoint == lastASCIICodePoint:
                self.addMatch(lowCodePoint)
            else:
                self.addRange(lowCodePoint, lastASCIICodePoint)
            if highCodePoint == firstUnicodeCodePoint:
                self.addMatch(highCodePoint)
            else:
                self.addRange(firstUnicodeCodePoint, highCodePoint)
        else:
            if (len(self.unicodeMatches) and self.unicodeMatches[-1] > lowCodePoint) or (len(self.unicodeRanges) and self.unicodeRanges[-1][1] > lowCodePoint):
                self.addRangeUnordered(lowCodePoint, highCodePoint)
                return

            self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
            if len(self.unicodeMatches) and self.unicodeMatches[-1] == (lowCodePoint - 1):
                lowCodePoint = self.unicodeMatches.pop()
                self.unicodeRanges.append((lowCodePoint, highCodePoint))
            elif len(self.unicodeRanges) and self.unicodeRanges[-1][1] == (lowCodePoint - 1):
                priorRange = self.unicodeRanges.pop()
                self.unicodeRanges.append((priorRange[0], highCodePoint))
            else:
                self.unicodeRanges.append((lowCodePoint, highCodePoint))

    def addMatchUnorderedForMatchesAndRanges(self, codePoint, matches, ranges):
        if codePoint in matches:
            return
        insertLocation = None
        lowCodePoint = None
        highCodePoint = None
        for idx in xrange(len(matches)):
            match = matches[idx]
            if codePoint == match + 1:
                lowCodePoint = match
                if idx < (len(matches) - 1) and codePoint == matches[idx + 1] - 1:
                    highCodePoint = matches[idx + 1]
                    del matches[idx + 1]
                    self.codePointCount = self.codePointCount - 1
                else:
                    highCodePoint = codePoint
                del matches[idx]
                self.codePointCount = self.codePointCount - 1
                break
            elif codePoint == match - 1:
                lowCodePoint = codePoint
                highCodePoint = match
                del matches[idx]
                self.codePointCount = self.codePointCount - 1
                break
            elif codePoint < match:
                insertLocation = idx
                break

        if insertLocation is None:
            insertLocation = len(matches)
        if lowCodePoint is None:
            lowCodePoint = codePoint
            highCodePoint = codePoint

        for idx in xrange(len(ranges)):
            range = ranges[idx]
            if lowCodePoint >= range[0] and highCodePoint <= range[1]:
                return
            if lowCodePoint <= (range[1] + 1) and highCodePoint >= (range[0] - 1):
                while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1):
                    range = ranges[idx]
                    lowCodePoint = min(lowCodePoint, range[0])
                    highCodePoint = max(highCodePoint, range[1])
                    del ranges[idx]
                    self.codePointCount = self.codePointCount - (range[1] - range[0]) - 1

                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return
            elif highCodePoint < range[0]:
                if lowCodePoint != highCodePoint:
                    ranges.insert(idx, (lowCodePoint, highCodePoint))
                    self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                    return
                break

        if lowCodePoint != highCodePoint:
            ranges.append((lowCodePoint, highCodePoint))
            self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
            return

        if insertLocation is not None:
            matches.insert(insertLocation, codePoint)
            self.codePointCount = self.codePointCount + 1

    def addRangeUnorderedForMatchesAndRanges(self, lowCodePoint, highCodePoint, matches, ranges):
        if len(matches) and highCodePoint >= matches[0] and lowCodePoint <= matches[-1]:
            for idx in xrange(len(matches)):
                match = matches[idx]
                if lowCodePoint <= match and highCodePoint >= match:
                    while idx < len(matches) and highCodePoint >= matches[idx]:
                        del matches[idx]
                        self.codePointCount = self.codePointCount - 1
                    if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1:
                        highCodePoint = matches[idx + 1]
                        del matches[idx + 1]
                        self.codePointCount = self.codePointCount - 1
                    break
                elif lowCodePoint == match + 1:
                    lowCodePoint = match
                    while idx < len(matches) and highCodePoint >= matches[idx]:
                        del matches[idx]
                        self.codePointCount = self.codePointCount - 1

                    if idx < (len(matches) - 1) and highCodePoint == matches[idx + 1] - 1:
                        highCodePoint = matches[idx + 1]
                        del matches[idx + 1]
                        self.codePointCount = self.codePointCount - 1
                    break
                elif highCodePoint == match - 1:
                    highCodePoint = match
                    del matches[idx]
                    self.codePointCount = self.codePointCount - 1
                    break
                elif highCodePoint < match:
                    break

        for idx in xrange(len(ranges)):
            range = ranges[idx]
            if lowCodePoint >= range[0] and highCodePoint <= range[1]:
                return
            if lowCodePoint <= (range[1] + 1) and highCodePoint >= (range[0] - 1):
                while idx < len(ranges) and highCodePoint >= (ranges[idx][0] - 1):
                    range = ranges[idx]
                    lowCodePoint = min(lowCodePoint, range[0])
                    highCodePoint = max(highCodePoint, range[1])
                    del ranges[idx]
                    self.codePointCount = self.codePointCount - (range[1] - range[0]) - 1

                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return
            elif highCodePoint < range[0]:
                ranges.insert(idx, (lowCodePoint, highCodePoint))
                self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1
                return

        ranges.append((lowCodePoint, highCodePoint))
        self.codePointCount = self.codePointCount + (highCodePoint - lowCodePoint) + 1

    def addMatchUnordered(self, codePoint):
        if codePoint <= lastASCIICodePoint:
            self.addMatchUnorderedForMatchesAndRanges(codePoint, self.matches, self.ranges)
        else:
            self.addMatchUnorderedForMatchesAndRanges(codePoint, self.unicodeMatches, self.unicodeRanges)

    def addRangeUnordered(self, lowCodePoint, highCodePoint):
        if highCodePoint <= lastASCIICodePoint:
            self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.matches, self.ranges)
        elif lowCodePoint >= firstUnicodeCodePoint:
            self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges)
        else:
            if lowCodePoint == lastASCIICodePoint:
                self.addMatchUnorderedForMatchesAndRanges(lowCodePoint, self.matches, self.ranges)
            else:
                self.addRangeUnorderedForMatchesAndRanges(lowCodePoint, lastASCIICodePoint, self.unicodeMatches, self.ranges)
            if highCodePoint == firstUnicodeCodePoint:
                self.addMatchUnorderedForMatchesAndRanges(highCodePoint, self.unicodeMatches, self.unicodeRanges)
            else:
                self.addRangeUnorderedForMatchesAndRanges(firstUnicodeCodePoint, highCodePoint, self.unicodeMatches, self.unicodeRanges)

    def removeMatchFromRanges(self, codePoint, ranges):
        for idx in xrange(len(ranges)):
            range = ranges[idx]
            if range[0] <= codePoint and codePoint <= range[1]:
                ranges.pop(idx)
                if range[0] < codePoint and codePoint < range[1]:
                    lowRange = (range[0], codePoint - 1)
                    highRange = (codePoint + 1, range[1])
                    # Since list.insert inserts before the index given, handle the high range first
                    if highRange[0] == highRange[1]:
                        self.addMatchUnordered(highRange[0])
                    else:
                        ranges.insert(idx, highRange)
                    if lowRange[0] == lowRange[1]:
                        self.addMatchUnordered(lowRange[0])
                    else:
                        ranges.insert(idx, lowRange)
                else:
                    if range[0] == codePoint:
                        range = (codePoint + 1, range[1])
                    else:
                        range = (range[0], codePoint - 1)
                    if range[0] == range[1]:
                        self.addMatchUnordered(range[0])
                    else:
                        ranges.insert(idx, range)
                self.codePointCount = self.codePointCount - 1
                return

    def removeMatch(self, codePoint):
        if codePoint <= lastASCIICodePoint:
            if codePoint in self.matches:
                self.matches.remove(codePoint)
                self.codePointCount = self.codePointCount - 1
            else:
                self.removeMatchFromRanges(codePoint, self.ranges)
        else:
            if codePoint in self.unicodeMatches:
                self.unicodeMatches.remove(codePoint)
                self.codePointCount = self.codePointCount - 1
            else:
                self.removeMatchFromRanges(codePoint, self.unicodeRanges)

    def dumpMatchData(self, file, valuesPerLine, dataList, formatter):
        valuesThisLine = 0
        firstValue = True

        file.write("{")
        for elem in dataList:
            if firstValue:
                firstValue = False
            else:
                file.write(", ")
            valuesThisLine = valuesThisLine + 1
            if valuesThisLine > valuesPerLine:
                file.write("\n                 ")
                valuesThisLine = 1
            formatter(file, elem)
        file.write("}")

    def dump(self, file, commaAfter):
        file.write("static std::unique_ptr<CharacterClass> {}()\n{{\n".format(self.getCreateFuncName()))
        file.write("    // Name = {}, number of codePoints: {}\n".format(self.name, self.codePointCount))
        file.write("    auto characterClass = std::make_unique<CharacterClass>(\n")
        file.write("        std::initializer_list<UChar32>(")
        self.dumpMatchData(file, 8, self.matches, lambda file, match: (file.write("{0:0=#4x}".format(match))))
        file.write("),\n")
        file.write("        std::initializer_list<CharacterRange>(")
        self.dumpMatchData(file, 4, self.ranges, lambda file, range: (file.write("{{{0:0=#4x}, {1:0=#4x}}}".format(range[0], range[1]))))
        file.write("),\n")
        file.write("        std::initializer_list<UChar32>(")
        self.dumpMatchData(file, 8, self.unicodeMatches, lambda file, match: (file.write("{0:0=#6x}".format(match))))
        file.write("),\n")
        file.write("        std::initializer_list<CharacterRange>(")
        self.dumpMatchData(file, 4, self.unicodeRanges, lambda file, range: (file.write("{{{0:0=#6x}, {1:0=#6x}}}".format(range[0], range[1]))))
        file.write("));\n")

        file.write("    characterClass->m_hasNonBMPCharacters = {};\n".format(("false", "true")[self.hasNonBMPCharacters]))
        file.write("    return characterClass;\n}\n\n")

    @classmethod
    def dumpAll(cls, file):
        for propertyData in cls.allPropertyData:
            propertyData.dump(file, propertyData != cls.allPropertyData[-1])

        file.write("typedef std::unique_ptr<CharacterClass> (*CreateCharacterClass)();\n")
        file.write("static CreateCharacterClass createFunctions[{}] = {{\n   ".format(len(cls.allPropertyData)))
        functionsOnThisLine = 0
        for propertyData in cls.allPropertyData:
            file.write(" {},".format(propertyData.getCreateFuncName()))
            functionsOnThisLine = functionsOnThisLine + 1
            if functionsOnThisLine == 4:
                file.write("\n   ")
                functionsOnThisLine = 0

        file.write("};\n\n")

    @classmethod
    def createAndDumpHashTable(self, file, propertyDict, tablePrefix):
        propertyKeys = propertyDict.keys()
        numberOfKeys = len(propertyKeys)
        hashSize = ceilingToPowerOf2(numberOfKeys * 2)
        hashMask = hashSize - 1
        hashTable = [None] * hashSize
        valueTable = []
        tableSize = hashSize

        keyValuesToHash = []
        for propertyName in propertyKeys:
            propertyData = propertyDict[propertyName]
            keyValuesToHash.append((propertyName, propertyData.getIndex()))
            for alias in propertyData.aliases:
                keyValuesToHash.append((alias, propertyData.getIndex()))

        for keyValue in keyValuesToHash:
            key = keyValue[0]
            hash = stringHash(key) % hashSize
            while hashTable[hash] is not None:
                if hashTable[hash][1] is not None:
                    hash = hashTable[hash][1]
                else:
                    hashTable[hash] = (hashTable[hash][0], tableSize)
                    hashTable.append(None)
                    hash = tableSize
                    tableSize = tableSize + 1

            hashTable[hash] = (len(valueTable), None)
            valueTable.append((key, keyValue[1]))

        file.write("static const struct HashIndex {}TableIndex[{}] = {{\n".format(tablePrefix, len(hashTable)))

        for tableIndex in hashTable:
            value = -1
            next = -1
            if tableIndex is not None:
                value = tableIndex[0]
                if tableIndex[1] is not None:
                    next = tableIndex[1]

            file.write("    {{ {}, {} }},\n".format(value, next))

        file.write("};\n\n")

        file.write("static const struct HashValue {}TableValue[{}] = {{\n".format(tablePrefix, len(valueTable)))
        for value in valueTable:
            file.write("    {{ \"{}\", {} }},\n".format(value[0], value[1]))
        file.write("};\n\n")

        file.write("static const struct HashTable {}HashTable = \n".format(tablePrefix))
        file.write("    {{ {}, {}, {}TableValue, {}TableIndex }};\n\n".format(len(valueTable), hashMask, tablePrefix, tablePrefix))


class Scripts:
    def __init__(self):
        self.allPropertyData = []
        self.scriptsByName = {}
        self.scriptExtensionsByName = {}
        self.unknownScript = PropertyData("Unknown")
        self.unknownScript.setAliases(aliases.scriptAliasesFor("Unknown"))
        self.allPropertyData.append(self.unknownScript)
        self.scriptsParsed = False

    def parseScriptsFile(self, file):
        currentScriptName = None
        currentPropertyData = None
        # To calculate the Unknown, we gather all the codePoints asigned to a scripts as ranges,
        # sort them, and then go the list to create the inverse of the assigned ranges.
        assignedCodePointRanges = []

        for line in file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            codePoints = fields[0].strip()
            scriptName = fields[1].strip()

            if scriptName != currentScriptName:
                currentScriptName = scriptName
                currentPropertyData = PropertyData(scriptName)
                currentPropertyData.setAliases(aliases.scriptAliasesFor(scriptName))
                self.allPropertyData.append(currentPropertyData)
                self.scriptsByName[scriptName] = currentPropertyData

            dotDot = codePoints.find("..")
            if dotDot == -1:
                codePoint = int(codePoints, 16)
                currentPropertyData.addMatch(codePoint)
                assignedCodePointRanges.append((codePoint, codePoint))
            else:
                lowCodePoint = int(codePoints[:dotDot], 16)
                highCodePoint = int(codePoints[dotDot + 2:], 16)
                currentPropertyData.addRange(lowCodePoint, highCodePoint)
                assignedCodePointRanges.append((lowCodePoint, highCodePoint))

        assignedCodePointRanges.sort(key=lambda range: range[0])
        lastAssignedCodePoint = 0

        for range in assignedCodePointRanges:
            if range[0] - lastAssignedCodePoint > 1:
                if range[0] - lastAssignedCodePoint == 2:
                    self.unknownScript.addMatch(lastAssignedCodePoint + 1)
                else:
                    self.unknownScript.addRange(lastAssignedCodePoint + 1, range[0] - 1)
            lastAssignedCodePoint = range[1]

        if lastAssignedCodePoint < MaxUnicode:
            if MaxUnicode - lastAssignedCodePoint == 1:
                self.unknownScript.addMatch(MaxUnicode)
            else:
                self.unknownScript.addRange(lastAssignedCodePoint + 1, MaxUnicode)

        self.scriptsParsed = True

    def parseScriptExtensionsFile(self, file):
        currentPropertyData = None
        # To calculate the Unknown, we gather all the codePoints asigned to a scripts as ranges,
        # sort them, and then go the list to create the inverse of the assigned ranges.
        assignedCodePointRanges = []

        if not self.scriptsParsed:
            print("Error: parsing ScriptExtensions.txt before Scripts.txt")
            exit(1)

        commonScriptExtenstionPropertyData = None
        inheritedScriptExtensionPropertyData = None

        scriptName = "Common"
        if scriptName in self.scriptsByName:
            commonScriptExtenstionPropertyData = self.scriptsByName[scriptName].makeCopy()
        else:
            commonScriptExtenstionPropertyData = PropertyData(scriptName)
            commonScriptExtenstionPropertyData.setAliases(aliases.scriptAliasesFor(scriptName))
        self.allPropertyData.append(commonScriptExtenstionPropertyData)
        self.scriptExtensionsByName[scriptName] = commonScriptExtenstionPropertyData

        scriptName = "Inherited"
        if scriptName in self.scriptsByName:
            inheritedScriptExtensionPropertyData = self.scriptsByName[scriptName].makeCopy()
        else:
            inheritedScriptExtensionPropertyData = PropertyData(scriptName)
            inheritedScriptExtensionPropertyData.setAliases(aliases.scriptAliasesFor(scriptName))
        self.allPropertyData.append(inheritedScriptExtensionPropertyData)
        self.scriptExtensionsByName[scriptName] = inheritedScriptExtensionPropertyData

        for line in file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            codePoints = fields[0].strip()
            scriptAliasList = fields[1].strip()

            for scriptAlias in scriptAliasList.split(' '):
                scriptName = aliases.scriptNameForAlias(scriptAlias)
                currentPropertyData = None

                if scriptName not in self.scriptExtensionsByName:
                    currentPropertyData = self.scriptsByName[scriptName].makeCopy()
                    self.allPropertyData.append(currentPropertyData)
                    self.scriptExtensionsByName[scriptName] = currentPropertyData
                else:
                    currentPropertyData = self.scriptExtensionsByName[scriptName]

                dotDot = codePoints.find("..")
                if dotDot == -1:
                    codePoint = int(codePoints, 16)
                    currentPropertyData.addMatch(codePoint)
                    commonScriptExtenstionPropertyData.removeMatch(codePoint)
                    inheritedScriptExtensionPropertyData.removeMatch(codePoint)
                else:
                    lowCodePoint = int(codePoints[:dotDot], 16)
                    highCodePoint = int(codePoints[dotDot + 2:], 16)
                    currentPropertyData.addRange(lowCodePoint, highCodePoint)
                    for codePoint in xrange(lowCodePoint, highCodePoint + 1):
                        commonScriptExtenstionPropertyData.removeMatch(codePoint)
                        inheritedScriptExtensionPropertyData.removeMatch(codePoint)

        # For the scripts that don't have any additional extension codePoints, copy the script
        # data to the script extension with the same name
        for scriptName, propertyData in self.scriptsByName.iteritems():
            if scriptName not in self.scriptExtensionsByName:
                self.scriptExtensionsByName[scriptName] = propertyData

    def dump(self, file):
        file.write("// Scripts:\n")
        PropertyData.createAndDumpHashTable(file, self.scriptsByName, "script")

        file.write("// Script_Extensions:\n")
        PropertyData.createAndDumpHashTable(file, self.scriptExtensionsByName, "scriptExtension")


class GeneralCategory:
    def __init__(self, file):
        self.file = file
        self.allPropertyData = []
        self.propertyDataByCategory = {}
        self.createSpecialPropertyData("Any", (0, MaxUnicode))
        self.createSpecialPropertyData("ASCII", (0, lastASCIICodePoint))
        self.assignedPropertyData = self.createSpecialPropertyData("Assigned")
        self.unassignedProperyData = self.findPropertyGroupFor("Cn")[1]
        self.casedLetterPropertyData = self.findPropertyGroupFor("LC")[1]
        self.lastAddedCodePoint = 0

    def createSpecialPropertyData(self, name, range=None):
        propertyData = PropertyData(name)
        self.allPropertyData.append(propertyData)
        self.propertyDataByCategory[name] = propertyData
        if range:
            propertyData.addRange(range[0], range[1])

        return propertyData

    def findPropertyGroupFor(self, categoryAlias):
        category = aliases.generalCategoryForAlias(categoryAlias)
        allCategoryAliases = aliases.generalCategoryAliasesFor(category)
        categoryGroupAlias = categoryAlias[0]
        categoryGroup = aliases.generalCategoryForAlias(categoryGroupAlias)
        allCategoryGroupAlias = aliases.generalCategoryAliasesFor(categoryGroup)
        groupPropertyData = None
        propertyData = None

        if categoryGroup not in self.propertyDataByCategory:
            groupPropertyData = PropertyData(categoryGroup)
            groupPropertyData.setAliases(allCategoryGroupAlias)
            self.allPropertyData.append(groupPropertyData)
            self.propertyDataByCategory[categoryGroup] = groupPropertyData
        else:
            groupPropertyData = self.propertyDataByCategory[categoryGroup]

        if category not in self.propertyDataByCategory:
            propertyData = PropertyData(category)
            propertyData.setAliases(allCategoryAliases)
            self.allPropertyData.append(propertyData)
            self.propertyDataByCategory[category] = propertyData
        else:
            propertyData = self.propertyDataByCategory[category]

        return (groupPropertyData, propertyData)

    def addNextCodePoints(self, categoryAlias, codePoint, highCodePoint=None):
        if codePoint - self.lastAddedCodePoint > 1:
            propertyDatas = self.findPropertyGroupFor("Cn")
            if codePoint - self.lastAddedCodePoint == 2:
                propertyDatas[0].addMatch(self.lastAddedCodePoint + 1)
                propertyDatas[1].addMatch(self.lastAddedCodePoint + 1)
            else:
                propertyDatas[0].addRange(self.lastAddedCodePoint + 1, codePoint - 1)
                propertyDatas[1].addRange(self.lastAddedCodePoint + 1, codePoint - 1)

        propertyDatas = self.findPropertyGroupFor(categoryAlias)
        if highCodePoint:
            propertyDatas[0].addRange(codePoint, highCodePoint)
            propertyDatas[1].addRange(codePoint, highCodePoint)
            if categoryAlias == "Ll" or categoryAlias == "Lt" or categoryAlias == "Lu":
                self.casedLetterPropertyData.addRange(codePoint, highCodePoint)
            self.assignedPropertyData.addRange(codePoint, highCodePoint)

            self.lastAddedCodePoint = highCodePoint
        else:
            propertyDatas[0].addMatch(codePoint)
            propertyDatas[1].addMatch(codePoint)
            if categoryAlias == "Ll" or categoryAlias == "Lt" or categoryAlias == "Lu":
                self.casedLetterPropertyData.addMatch(codePoint)
            self.assignedPropertyData.addMatch(codePoint)

            self.lastAddedCodePoint = codePoint

    def parse(self):
        lastLineFirstOfRange = None
        lastLineCodePoint = 0
        for line in self.file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            codePoint = int(fields[0].strip(), 16)
            description = fields[1].strip()
            categoryAlias = fields[2].strip()

            if lastLineFirstOfRange:
                if description[-5:-1] == "Last":
                    self.addNextCodePoints(categoryAlias, lastLineFirstOfRange, codePoint)
                    lastLineFirstOfRange = None
                    continue
                else:
                    print("Malformed First..Last pair of lines")

            if description[-6:-1] == "First":
                lastLineFirstOfRange = codePoint
                continue

            self.addNextCodePoints(categoryAlias, codePoint)

        if self.lastAddedCodePoint < MaxUnicode:
            propertyDatas = self.findPropertyGroupFor("Cn")
            if MaxUnicode - self.lastAddedCodePoint == 1:
                propertyDatas[0].addMatch(MaxUnicode)
                propertyDatas[1].addMatch(MaxUnicode)
            else:
                propertyDatas[0].addRange(self.lastAddedCodePoint + 1, MaxUnicode)
                propertyDatas[1].addRange(self.lastAddedCodePoint + 1, MaxUnicode)

    def dump(self, file):
        file.write("// General_Category:\n")
        PropertyData.createAndDumpHashTable(file, self.propertyDataByCategory, "generalCategory")


class BinaryProperty:
    def __init__(self):
        self.allPropertyData = []
        self.propertyDataByProperty = {}

    def parsePropertyFile(self, file):
        currentPropertyName = None
        currentPropertyData = None

        for line in file:
            line = line.split('#', 1)[0]
            line = line.rstrip()
            if (not len(line)):
                continue

            fields = line.split(';')
            if (not fields):
                continue

            codePoints = fields[0].strip()
            propertyName = fields[1].strip()

            if propertyName != currentPropertyName:
                if propertyName not in SupportedBinaryProperties:
                    continue

                currentPropertyName = propertyName
                currentPropertyData = PropertyData(propertyName)
                currentPropertyData.setAliases(aliases.globalAliasesFor(propertyName))
                self.allPropertyData.append(currentPropertyData)
                self.propertyDataByProperty[propertyName] = currentPropertyData

            dotDot = codePoints.find("..")
            if dotDot == -1:
                currentPropertyData.addMatch(int(codePoints, 16))
            else:
                currentPropertyData.addRange(int(codePoints[:dotDot], 16), int(codePoints[dotDot + 2:], 16))

    def dump(self, file):
        file.write("// binary properties:\n")
        PropertyData.createAndDumpHashTable(file, self.propertyDataByProperty, "binaryProperty")

if __name__ == "__main__":
    parser = optparse.OptionParser(usage="usage: %prog <UCD-Directory> <YarrUnicodePropertyData.h>")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("<UCD-Directory> <YarrUnicodePropertyData.h>")

    UCDDirectoryPath = args[0]
    unicodeProertyDataHPath = args[1]

    verifyUCDFilesExist()

    propertyAliasesFile = openUCDFileOrExit("PropertyAliases.txt")
    propertyValueAliasesFile = openUCDFileOrExit("PropertyValueAliases.txt")
    scriptsFile = openUCDFileOrExit("Scripts.txt")
    scriptExtensionsFile = openUCDFileOrExit("ScriptExtensions.txt")
    unicodeDataFile = openUCDFileOrExit("UnicodeData.txt")
    derivedBinaryPropertiesFile = openUCDFileOrExit("DerivedBinaryProperties.txt")
    derivedCorePropertiesFile = openUCDFileOrExit("DerivedCoreProperties.txt")
    derivedNormalizationPropertiesFile = openUCDFileOrExit("DerivedNormalizationProps.txt")
    propListFile = openUCDFileOrExit("PropList.txt")
    emojiDataFile = openUCDFileOrExit("emoji-data.txt")

    aliases = Aliases()

    propertyDataHFile = openOrExit(unicodeProertyDataHPath, "wb")

    propertyDataHFile.write(header)

    aliases.parsePropertyAliasesFile(propertyAliasesFile)
    aliases.parsePropertyValueAliasesFile(propertyValueAliasesFile)

    generalCategory = GeneralCategory(unicodeDataFile)
    generalCategory.parse()

    binaryProperty = BinaryProperty()
    binaryProperty.parsePropertyFile(derivedBinaryPropertiesFile)
    binaryProperty.parsePropertyFile(derivedCorePropertiesFile)
    binaryProperty.parsePropertyFile(derivedNormalizationPropertiesFile)
    binaryProperty.parsePropertyFile(propListFile)
    binaryProperty.parsePropertyFile(emojiDataFile)

    scripts = Scripts()
    scripts.parseScriptsFile(scriptsFile)
    scripts.parseScriptExtensionsFile(scriptExtensionsFile)

    PropertyData.dumpAll(propertyDataHFile)
    generalCategory.dump(propertyDataHFile)
    binaryProperty.dump(propertyDataHFile)
    scripts.dump(propertyDataHFile)

    propertyDataHFile.write(footer)

    exit(0)