generateYarrCanonicalizeUnicode [plain text]

#! /usr/bin/python

# Copyright (C) 2016 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1.  Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer. 
# 2.  Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution. 
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# This tool processes the Unicode Character Database file CaseFolding.txt to create
# the canonicalization table as described in the ECMAScript 6 standard, section
# "21.2.2.8.2 Runtime Semantics: Canonicalize()", step 2.

import optparse
import re
import sys
from sets import Set

# Text written verbatim at the top of the generated C++ file: license block,
# generated-file banner, includes, and the opening of the JSC::Yarr namespaces.
header = """/*
* Copyright (C) 2016 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1.  Redistributions of source code must retain the above copyright
*     notice, this list of conditions and the following disclaimer. 
* 2.  Redistributions in binary form must reproduce the above copyright
*     notice, this list of conditions and the following disclaimer in the
*     documentation and/or other materials provided with the distribution. 
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// DO NOT EDIT! - This file was generated by generateYarrCanonicalizeUnicode

#include "config.h"
#include "YarrCanonicalize.h"

namespace JSC { namespace Yarr {

"""

# Text written verbatim at the end of the generated file; it closes the two
# namespaces opened by `header`.
footer = """} } // JSC::Yarr
"""

# Highest Unicode code point. The generator assigns a canonicalization type to
# every code point from 0 through MaxUnicode inclusive.
MaxUnicode = 0x10ffff
# Matches the start of a CaseFolding.txt entry of the form
# "<code>; <status>; <mapping>" whose status letter is C or S (going by the
# name, the "common" and "simple" foldings). Entries with any other status
# letter do not match. The named groups `code` and `mapping` capture the two
# hexadecimal fields; re.IGNORECASE makes both the hex digits and the status
# letter case-insensitive.
commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE)

def openOrExit(path, mode):
    """Open `path` with `mode`, or report the failure and terminate.

    Returns the open file object on success. If open() raises IOError, a
    one-line message containing the path, errno and strerror is written to
    stderr and the process exits with status 1.
    """
    try:
        return open(path, mode)
    except IOError as e:
        # Diagnostics go to stderr so they never mix with regular output, and
        # sys.exit is used because the bare exit() helper is only injected by
        # the site module and is not guaranteed to exist.
        sys.stderr.write("I/O error opening {0}, ({1}): {2}\n".format(path, e.errno, e.strerror))
        sys.exit(1)

class Canonicalize:
    """Groups code points by case-folding target and emits C++ lookup tables.

    Typical use: call readCaseFolding() with the CaseFolding.txt contents,
    then createTables() with the output file.
    """

    def __init__(self):
        # Maps a folding target (int code point) to the list of code points
        # that fold to it, in the order they were added.
        self.canonicalGroups = {}

    def addMapping(self, code, mapping):
        """Record that code point `code` folds to code point `mapping`."""
        self.canonicalGroups.setdefault(mapping, []).append(code)

    def readCaseFolding(self, file):
        """Populate canonicalGroups from an iterable of CaseFolding.txt lines.

        Everything from the first '#' on a line is discarded, and lines that
        are then empty or do not match commonAndSimpleLinesRE are skipped.
        Afterwards every code point in 0..MaxUnicode that had no matching
        entry of its own is mapped to itself, so that every code point in
        that interval ends up in some group.
        """
        # Built-in set instead of the deprecated sets.Set.
        codesSeen = set()
        for line in file:
            line = line.split('#', 1)[0].rstrip()
            if not line:
                continue

            fields = commonAndSimpleLinesRE.match(line)
            if not fields:
                continue

            code = int(fields.group('code'), 16)
            mapping = int(fields.group('mapping'), 16)

            codesSeen.add(code)
            self.addMapping(code, mapping)

        for code in range(MaxUnicode + 1):
            if code not in codesSeen:
                self.addMapping(code, code)

    def createTables(self, file):
        """Write the character-set arrays and the range table to `file`.

        Each group in canonicalGroups is classified by its member count:
          * one member: CanonicalizeUnique;
          * more than two members: CanonicalizeSet, whose value is the index
            of the group's array in unicodeCharacterSetInfo;
          * two adjacent members: CanonicalizeAlternatingUnaligned when the
            lower code point is odd, else CanonicalizeAlternatingAligned;
          * two non-adjacent members: CanonicalizeRangeLo for the lower and
            CanonicalizeRangeHi for the higher, whose value is the distance
            between them.
        Runs of consecutive code points with an identical classification are
        then merged into single range entries. Groups with more than one
        member are sorted in place.
        """
        # typeInfo[cp] holds "<TypeName>:<value>" for code point cp.
        typeInfo = [""] * (MaxUnicode + 1)
        characterSets = []

        for mapping in sorted(self.canonicalGroups):
            characters = self.canonicalGroups[mapping]
            if len(characters) == 1:
                typeInfo[characters[0]] = "CanonicalizeUnique:0"
                continue

            characters.sort()
            if len(characters) > 2:
                setType = "CanonicalizeSet:%d" % len(characterSets)
                for ch in characters:
                    typeInfo[ch] = setType
                characterSets.append(characters)
                continue

            low = characters[0]
            high = characters[1]
            delta = high - low
            if delta == 1:
                pairType = "CanonicalizeAlternatingUnaligned:0" if low & 1 else "CanonicalizeAlternatingAligned:0"
                typeInfo[low] = pairType
                typeInfo[high] = pairType
            else:
                typeInfo[low] = "CanonicalizeRangeLo:%d" % delta
                typeInfo[high] = "CanonicalizeRangeHi:%d" % delta

        # Collapse runs of identical classifications into (begin, end, type).
        rangeInfo = []
        end = 0
        while end <= MaxUnicode:
            begin = end
            rangeType = typeInfo[end]
            while end < MaxUnicode and typeInfo[end + 1] == rangeType:
                end = end + 1
            rangeInfo.append((begin, end, rangeType))
            end = end + 1

        # One array per multi-member set; each array ends with a 0 entry.
        for index, members in enumerate(characterSets):
            characters = "".join("0x{character:04x}, ".format(character=ch) for ch in members)
            file.write("const UChar32 unicodeCharacterSet{index:d}[] = {{ {characters}0 }};\n".format(index=index, characters=characters))

        file.write("\n")
        file.write("static const size_t UNICODE_CANONICALIZATION_SETS = {setCount:d};\n".format(setCount=len(characterSets)))
        file.write("const UChar32* const unicodeCharacterSetInfo[UNICODE_CANONICALIZATION_SETS] = {\n")

        for index in range(len(characterSets)):
            file.write("    unicodeCharacterSet{setNumber:d},\n".format(setNumber=index))

        file.write("};\n")
        file.write("\n")
        file.write("const size_t UNICODE_CANONICALIZATION_RANGES = {rangeCount:d};\n".format(rangeCount=len(rangeInfo)))
        file.write("const CanonicalizationRange unicodeRangeInfo[UNICODE_CANONICALIZATION_RANGES] = {\n")

        for begin, end, rangeType in rangeInfo:
            typeAndValue = rangeType.split(":")
            file.write("    {{ 0x{begin:04x}, 0x{end:04x}, 0x{value:04x}, {type} }},\n".format(begin=begin, end=end, value=int(typeAndValue[1]), type=typeAndValue[0]))

        file.write("};\n")
        file.write("\n")

        
# Command-line entry point: read CaseFolding.txt (first argument) and write the
# generated canonicalization source (second argument).
if __name__ == "__main__":
    parser = optparse.OptionParser(usage = "usage: %prog  <CaseFolding.txt> <YarrCanonicalizeUnicode.h>")
    (options, args) = parser.parse_args()

    # Exactly two positional arguments are required; parser.error() prints the
    # usage text plus this message and exits.
    if len(args) != 2:
        parser.error("<CaseFolding.txt> <YarrCanonicalizeUnicode.h>")

    caseFoldingTxtPath = args[0]
    canonicalizeHPath = args[1]

    # The with-blocks guarantee both files are closed even if parsing or table
    # generation raises. Both files are still opened before any processing.
    with openOrExit(caseFoldingTxtPath, "r") as caseFoldingTxtFile:
        # NOTE(review): binary mode presumably keeps the generated file's line
        # endings untranslated on every platform -- confirm before changing.
        with openOrExit(canonicalizeHPath, "wb") as canonicalizeHFile:
            canonicalize = Canonicalize()
            canonicalize.readCaseFolding(caseFoldingTxtFile)

            canonicalizeHFile.write(header)
            canonicalize.createTables(canonicalizeHFile)
            canonicalizeHFile.write(footer)

    # sys.exit rather than the site-injected exit() helper.
    sys.exit(0)