genUnicode.py [plain text]

#!/usr/bin/python -u
import sys
import string
import time

sources = "Blocks-4.txt UnicodeData-3.1.0.txt"

try:
    blocks = open("Blocks-4.txt", "r")
except:
    print "Missing Blocks-4.txt, aborting ..."
    sys.exit(1)

BlockNames = {}
for line in blocks.readlines():
    if line[0] == '#':
        continue
    line = string.strip(line)
    if line == '':
	continue
    try:
	fields = string.split(line, ';')
	range = string.strip(fields[0])
	(start, end) = string.split(range, "..")
	name = string.strip(fields[1])
	name = string.replace(name, ' ', '')
    except:
        print "Failed to process line: %s" % (line)
	continue
    BlockNames[name] = ("0x"+start, "0x"+end)
blocks.close()
print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))

try:
    data = open("UnicodeData-3.1.0.txt", "r")
except:
    print "Missing UnicodeData-3.1.0.txt, aborting ..."
    sys.exit(1)

nbchar = 0;
Categories = {}
for line in data.readlines():
    if line[0] == '#':
        continue
    line = string.strip(line)
    if line == '':
	continue
    try:
	fields = string.split(line, ';')
	point = string.strip(fields[0])
	value = 0
	while point != '':
	    value = value * 16
	    if point[0] >= '0' and point[0] <= '9':
	        value = value + ord(point[0]) - ord('0')
	    elif point[0] >= 'A' and point[0] <= 'F':
	        value = value + 10 + ord(point[0]) - ord('A')
	    elif point[0] >= 'a' and point[0] <= 'f':
	        value = value + 10 + ord(point[0]) - ord('a')
	    point = point[1:]
	name = fields[2]
    except:
        print "Failed to process line: %s" % (line)
	continue
    
    nbchar = nbchar + 1
    try:
	Categories[name].append(value)
    except:
        try:
	    Categories[name] = [value]
	except:
	    print "Failed to process line: %s" % (line)
    try:
	Categories[name[0]].append(value)
    except:
        try:
	    Categories[name[0]] = [value]
	except:
	    print "Failed to process line: %s" % (line)
	
blocks.close()
print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
#reduce the number list into ranges
for cat in Categories.keys():
    list = Categories[cat]
    start = -1
    prev = -1
    end = -1
    ranges = []
    for val in list:
        if start == -1:
	    start = val
	    prev = val
	    continue
	elif val == prev + 1:
	    prev = val
	    continue
	elif prev == start:
	    ranges.append((prev, prev))
	    start = val
	    prev = val
	    continue
	else:
	    ranges.append((start, prev))
	    start = val
	    prev = val
	    continue
    if prev == start:
        ranges.append((prev, prev))
    else:
        ranges.append((start, prev))
    Categories[cat] = ranges
        
#
# Generate the resulting files
#
try:
    header = open("xmlunicode.h", "w")
except:
    print "Failed to open xmlunicode.h"
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except:
    print "Failed to open xmlunicode.c"
    sys.exit(1)

date = time.asctime(time.localtime(time.time()))

header.write(
"""/*
 * xmlunicode.h: this header exports interfaces for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#ifdef __cplusplus
extern "C" {
#endif

""" % (date, sources));
output.write(
"""/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>

""" % (date, sources));

keys = BlockNames.keys()
keys.sort()
for block in keys:
    (start, end) = BlockNames[block]
    name = string.replace(block, '-', '')
    header.write("int\txmlUCSIs%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
                 (block))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
    output.write("int\nxmlUCSIs%s(int code) {\n" % name)
    output.write("    return((code >= %s) && (code <= %s));\n" % (start, end))
    output.write("}\n\n")

header.write("\nint\txmlUCSIsBlock\t(int code,\n\t\t\t const char *block);\n\n")
output.write("/**\n * xmlUCSIsBlock:\n * @code: UCS code point\n")
output.write(" * @block: UCS block name\n")
output.write(" *\n * Check whether the caracter is part of the UCS Block\n")
output.write(" *\n * Returns 1 if true, 0 if false and -1 on unknown block\n */\n");
output.write("int\nxmlUCSIsBlock(int code, const char *block) {\n")
keys = BlockNames.keys()
keys.sort()
for block in keys:
    name = string.replace(block, '-', '')
    output.write("    if (!strcmp(block, \"%s\"))\n        return(xmlUCSIs%s(code));\n" %
                 (block, name));
output.write("    return(-1);\n}\n\n")


keys = Categories.keys()
keys.sort()
for name in keys:
    ranges = Categories[name]
    header.write("int\txmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
                 (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    start = 1
    for range in ranges:
        (begin, end) = range;
	if start:
	    output.write("    return(");
	    start = 0
	else:
	    output.write(" ||\n           ");
	if (begin == end):
	    output.write("(code == %s)" % (hex(begin)))
	else:
	    output.write("((code >= %s) && (code <= %s))" % (
	                 hex(begin), hex(end)))
    output.write(");\n}\n\n")

header.write("\nint\txmlUCSIsCat\t(int code,\n\t\t\t const char *cat);\n")
output.write("/**\n * xmlUCSIsCat:\n * @code: UCS code point\n")
output.write(" * @cat: UCS Category name\n")
output.write(" *\n * Check whether the caracter is part of the UCS Category\n")
output.write(" *\n * Returns 1 if true, 0 if false and -1 on unknown category\n */\n");
output.write("int\nxmlUCSIsCat(int code, const char *cat) {\n")
keys = Categories.keys()
keys.sort()
for name in keys:
    output.write("    if (!strcmp(cat, \"%s\"))\n        return(xmlUCSIsCat%s(code));\n" %
                 (name, name));
output.write("    return(-1);\n}\n\n")

header.write("""
#ifdef __cplusplus
}
#endif
#endif /* __XML_UNICODE_H__ */
""");
output.write("""
#endif /* LIBXML_UNICODE_ENABLED */
""");
header.close()
output.close()