generate_normalize_data.pl [plain text]
use strict;
use lib qw(.);
use Getopt::Long;
use UCD;
use SparseMap;
use constant UCS_MAX => 0x110000;
use constant END_BIT => 0x80000000;
my $DECOMP_COMPAT_BIT = 0x8000;
my $CASEMAP_FINAL_BIT = 0x1;
my $CASEMAP_NONFINAL_BIT = 0x2;
my $CASEMAP_LAST_BIT = 0x10;
my $LETTER_BIT = 1;
my $NSPMARK_BIT = 2;
(my $myid = '$Id: generate_normalize_data.pl,v 1.1.1.1 2003/06/04 00:27:55 marka Exp $') =~ s/\$([^\$]+)\$/\$-$1-\$/;
my @default_bits = (9, 7, 5);
my @canon_class_bits = @default_bits;
my @decomp_bits = @default_bits;
my @comp_bits = @default_bits;
my @folding_bits = @default_bits;
my @casemap_bits = @default_bits;
my @casemap_ctx_bits = @default_bits;
my $prefix = '';
my $dir = '.';
my $unicodedatafile = 'UnicodeData.txt';
my $exclusionfile = 'CompositionExclusions.txt';
my $specialcasefile = 'SpecialCasing.txt';
my $casefoldingfile = 'CaseFolding.txt';
my $verbose;
GetOptions('dir|d=s' => \$dir,
'unicodedata|u=s' => \$unicodedatafile,
'exclude|e=s' => \$exclusionfile,
'specialcase|s=s' => \$specialcasefile,
'casefold|c=s' => \$casefoldingfile,
'prefix|p=s' => \$prefix,
'verbose|v' => \$verbose,
) or usage();
foreach my $r (\$unicodedatafile, \$exclusionfile,
\$specialcasefile, \$casefoldingfile) {
$$r = "$dir/$$r" unless $$r =~ m|^/|;
}
my %exclusions;
my %lower_special;
my %upper_special;
my @decomp_data;
my @comp_data;
my @toupper_data;
my @tolower_data;
my @folding_data;
my $canon_class = SparseMap::Int->new(BITS => [@canon_class_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
my $decomp = SparseMap::Int->new(BITS => [@decomp_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
my $comp = SparseMap::Int->new(BITS => [@comp_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
my $upper = SparseMap::Int->new(BITS => [@casemap_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
my $lower = SparseMap::Int->new(BITS => [@casemap_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
my $casemap_ctx = SparseMap::Int->new(BITS => [@casemap_ctx_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
my $folding = SparseMap::Int->new(BITS => [@folding_bits],
MAX => UCS_MAX,
MAPALL => 1,
DEFAULT => 0);
read_exclusion_file();
read_specialcasing_file();
read_unicodedata_file();
read_casefolding_file();
print_header();
print_canon_class();
print_composition();
print_decomposition();
print_casemap();
print_casemap_context();
print_casefolding();
exit;
sub usage {
print STDERR <<"END";
Usage: $0 [options..]
options:
-d DIR directory where Unicode Character Data files resides [./]
-u FILE name of the UnicodeData file [UnicodeData.txt]
-e FILE name of the CompositionExclusion file [CompositionExclusions-1.txt]
-s FILE name of the SpecialCasing file [SpecialCasing.txt]
-c FILE name of the CaseFolding file [CaseFolding.txt]
END
exit 1;
}
sub read_exclusion_file {
open EXCLUDE, $exclusionfile or die "cannot open $exclusionfile: $!\n";
while ($_ = UCD::CompositionExclusions::getline(\*EXCLUDE)) {
my %data = UCD::CompositionExclusions::parseline($_);
$exclusions{$data{CODE}} = 1;
}
close EXCLUDE;
}
sub read_specialcasing_file {
open SPCASE, $specialcasefile or die "cannot open $specialcasefile: $!\n";
while ($_ = UCD::SpecialCasing::getline(\*SPCASE)) {
my %data = UCD::SpecialCasing::parseline($_);
my $code = $data{CODE};
my $lower = $data{LOWER};
my $upper = $data{UPPER};
my $cond = $data{CONDITION} || '';
next unless $cond eq '' or $cond =~ /^(NON_)?FINAL/;
if (defined $cond && (@$lower > 1 || $lower->[0] != $code)
or @$lower > 1 or $lower->[0] != $code) {
$lower_special{$code} = [$lower, $cond];
}
if (defined $cond && (@$upper > 1 || $upper->[0] != $code)
or @$upper > 1 or $upper->[0] != $code) {
$upper_special{$code} = [$upper, $cond];
}
}
close SPCASE;
}
sub read_unicodedata_file {
open UCD, $unicodedatafile or die "cannot open $unicodedatafile: $!\n";
@decomp_data = (0);
@toupper_data = (0);
@tolower_data = (0);
my @comp_cand; my %nonstarter;
while ($_ = UCD::UnicodeData::getline(\*UCD)) {
my %data = UCD::UnicodeData::parseline($_);
my $code = $data{CODE};
if ($data{CLASS} > 0) {
$nonstarter{$code} = 1;
$canon_class->add($code, $data{CLASS});
}
if (exists $upper_special{$code} or defined $data{UPPER}) {
my $offset = @toupper_data;
my @casedata;
$upper->add($code, $offset);
if (exists $upper_special{$code}) {
push @casedata, $upper_special{$code};
}
if (defined $data{UPPER}) {
push @casedata, $data{UPPER};
}
push @toupper_data, casemap_data(@casedata);
}
if (exists $lower_special{$code} or defined $data{LOWER}) {
my $offset = @tolower_data;
my @casedata;
$lower->add($code, $offset);
if (exists $lower_special{$code}) {
push @casedata, $lower_special{$code};
}
if (defined $data{LOWER}) {
push @casedata, $data{LOWER};
}
push @tolower_data, casemap_data(@casedata);
}
if ($data{DECOMP}) {
my ($tag, @decomp) = @{$data{DECOMP}};
my $offset = @decomp_data;
if ($tag eq '' and @decomp > 1 and not exists $exclusions{$code}) {
push @comp_cand, [$code, @decomp];
}
if ($tag ne '') {
$offset |= $DECOMP_COMPAT_BIT;
}
$decomp->add($code, $offset);
push @decomp_data, @decomp;
$decomp_data[-1] |= END_BIT;
}
if ($data{CATEGORY} =~ /L[ult]/) {
$casemap_ctx->add($code, $LETTER_BIT);
} elsif ($data{CATEGORY} eq 'Mn') {
$casemap_ctx->add($code, $NSPMARK_BIT);
}
}
close UCD;
@comp_cand = grep {not exists $nonstarter{$_->[1]}} @comp_cand;
@comp_data = ([0, 0, 0]);
my $last_code = -1;
my $last_offset = @comp_data;
for my $r (sort {$a->[1] <=> $b->[1] || $a->[2] <=> $b->[2]} @comp_cand) {
if ($r->[1] != $last_code) {
$comp->add($last_code,
($last_offset | ((@comp_data - $last_offset)<<16)))
unless $last_code == -1;
$last_code = $r->[1];
$last_offset = @comp_data;
}
push @comp_data, $r;
}
$comp->add($last_code,
($last_offset | ((@comp_data - $last_offset)<<16)));
}
sub casemap_data {
my @data = @_;
my @result = ();
while (@data > 0) {
my $r = shift @data;
my $flag = 0;
if (ref $r) {
if ($r->[1] eq 'FINAL') {
$flag |= $CASEMAP_FINAL_BIT;
} elsif ($r->[1] eq 'NON_FINAL') {
$flag |= $CASEMAP_NONFINAL_BIT;
} elsif ($r->[1] ne '') {
die "unknown condition \"", $r->[1], "\"\n";
}
}
$flag |= $CASEMAP_LAST_BIT if @data == 0;
push @result, $flag;
push @result, (ref $r) ? @{$r->[0]} : $r;
$result[-1] |= END_BIT;
}
@result;
}
sub read_casefolding_file {
open FOLD, $casefoldingfile or die "cannto open $casefoldingfile: $!\n";
@folding_data = (0);
while ($_ = UCD::CaseFolding::getline(\*FOLD)) {
my %data = UCD::CaseFolding::parseline($_);
$folding->add($data{CODE}, scalar(@folding_data));
push @folding_data, @{$data{MAP}};
$folding_data[-1] |= END_BIT;
}
close FOLD;
}
sub print_header {
print <<"END";
/* \$Id\$ */
/* $myid */
/*
* Do not edit this file!
* This file is generated from UnicodeData.txt, CompositionExclusions-1.txt,
* SpecialCasing.txt and CaseFolding.txt.
*/
END
}
sub print_canon_class {
$canon_class->fix();
print STDERR "** cannon_class\n", $canon_class->stat() if $verbose;
print <<"END";
/*
* Canonical Class
*/
END
print_bits("CANON_CLASS", @canon_class_bits);
print "\n";
print $canon_class->cprog(NAME => "${prefix}canon_class");
}
sub print_composition {
$comp->fix();
print STDERR "** composition\n", $comp->stat() if $verbose;
print <<"END";
/*
* Canonical Composition
*/
END
print_bits("CANON_COMPOSE", @comp_bits);
print "\n";
print $comp->cprog(NAME => "${prefix}compose");
print <<"END";
static const struct composition ${prefix}compose_seq[] = {
END
my $i = 0;
foreach my $r (@comp_data) {
if ($i % 2 == 0) {
print "\n" if $i != 0;
print "\t";
}
printf "{ 0x%08x, 0x%08x }, ", $r->[2], $r->[0];
$i++;
}
print "\n};\n\n";
}
sub print_decomposition {
$decomp->fix();
print STDERR "** decomposition\n", $decomp->stat() if $verbose;
print <<"END";
/*
* Canonical/Compatibility Decomposition
*/
END
print_bits("DECOMP", @decomp_bits);
print "#define DECOMP_COMPAT\t$DECOMP_COMPAT_BIT\n\n";
print $decomp->cprog(NAME => "${prefix}decompose");
print "static const unsigned long ${prefix}decompose_seq[] = {\n";
print_ulseq(@decomp_data);
print "};\n\n";
}
sub print_casemap {
$upper->fix();
$lower->fix();
print STDERR "** upper mapping\n", $upper->stat() if $verbose;
print STDERR "** lower mapping\n", $lower->stat() if $verbose;
print <<"END";
/*
* Lowercase <-> Uppercase mapping
*/
/*
* Flags for special case mapping.
*/
END
print_bits("CASEMAP", @casemap_bits);
print "\n";
print $upper->cprog(NAME => "${prefix}toupper");
print $lower->cprog(NAME => "${prefix}tolower");
print "static const unsigned long ${prefix}toupper_seq[] = {\n";
print_ulseq(@toupper_data);
print "};\n\n";
print "static const unsigned long ${prefix}tolower_seq[] = {\n";
print_ulseq(@tolower_data);
print "};\n\n";
}
sub print_casefolding {
$folding->fix();
print STDERR "** case folding\n", $folding->stat() if $verbose;
print <<"END";
/*
* Case Folding
*/
END
print_bits("CASE_FOLDING", @folding_bits);
print "\n";
print $folding->cprog(NAME => "${prefix}case_folding");
print "static const unsigned long ${prefix}case_folding_seq[] = {\n";
print_ulseq(@folding_data);
print "};\n\n";
}
sub print_casemap_context {
$casemap_ctx->fix();
print STDERR "** casemap context\n", $casemap_ctx->stat() if $verbose;
print <<"END";
/*
* Cased characters and non-spacing marks (for casemap context)
*/
END
print_bits("CASEMAP_CTX", @casemap_ctx_bits);
print <<"END";
END
print $casemap_ctx->cprog(NAME => "${prefix}casemap_ctx");
}
sub sprint_composition_hash {
my $i = 0;
my $s = '';
foreach my $r (@_) {
if ($i % 2 == 0) {
$s .= "\n" if $i != 0;
$s .= "\t";
}
$s .= sprintf "{0x%04x, 0x%04x, 0x%04x}, ", @{$r};
$i++;
}
$s;
}
sub print_bits {
my $prefix = shift;
my $i = 0;
foreach my $bit (@_) {
print "#define ${prefix}_BITS_$i\t$bit\n";
$i++;
}
}
sub print_ulseq {
my $i = 0;
foreach my $v (@_) {
if ($i % 4 == 0) {
print "\n" if $i != 0;
print "\t";
}
printf "0x%08x, ", $v;
$i++;
}
print "\n";
}