generate_from_icu   [plain text]


#!/usr/bin/perl -w

use strict;

use 5.006;

use DateTime;
use File::Basename;
use File::Copy;
use File::Spec;
use Getopt::Long;
use XML::Simple;

my $VERSION = "0.02";

my ( %LocalesWithoutData, %XML, %Parents,
     %LanguageLookup, %TerritoryLookup, %ScriptLookup );

my $ScriptName = basename($0);

my %Aliases =
    (
     'C'             => 'en_US_POSIX',
     'POSIX'         => 'en_US_POSIX',
     # Apparently the Hebrew locale code was changed from iw to he at
     # one point.
     'iw'            => 'he',
     'iw_IL'         => 'he_IL',
     # ICU got rid of no
     'no'            => 'nn',
     'no_NO'         => 'nn_NO',
     'no_NO_NY'      => 'nn_NO',
    );

# it's an array because the order in which the regexes are checked is
# important
my @JavaPatterns =
    ( qr/G/     => '{era}',
      qr/yyyy/  => '{ce_year}',
      qr/y/     => 'y',
      qr/u/     => 'Y',
      qr/MMMM/  => 'B',
      qr/MMM/   => 'b',
      qr/MM/    => 'm',
      qr/M/     => '{month}',
      qr/dd/    => 'd',
      qr/d/     => '{day}',
      qr/hh/    => 'l',
      qr/h/     => '{hour_12}',
      qr/HH/    => 'H',
      qr/H/     => '{hour}',
      qr/mm/    => 'M',
      qr/m/     => '{minute}',
      qr/ss/    => 'S',
      qr/s/     => '{second}',
      qr/S/     => 'N',
      qr/EEEE/  => 'A',
      qr/E/     => 'a',
      qr/D/     => 'j',
      qr/F/     => '{weekday_of_month}',
      qr/w/     => 'V',
      qr/W/     => '{week_month}',
      qr/a/     => 'p',
      qr/k/     => '{hour_1}',
      qr/K/     => '{hour_12_0}',
      qr/z/     => '{time_zone_long_name}',
    );

my @FormatLengths = qw( full long medium short  );

my %opts;

main();

sub main
{
    GetOptions( 'dir:s'   => \$opts{dir},
                'name:s'  => \$opts{name},
                'file:s'  => \$opts{file},
                'quiet'   => \$opts{quiet},
                'verbose' => \$opts{verbose},
                'help'    => \$opts{help},
              );

    $opts{help} = 1
        unless defined $opts{dir} && -d $opts{dir};

    usage() if $opts{help};

    unless ( $opts{name} )
    {
        copy( 'MANIFEST.base', 'MANIFEST' );
        open MAN, ">>MANIFEST" or die "Cannot write to MANIFEST: $!";
    }

    binmode STDOUT, ':utf8' if $opts{verbose};
    $| = 1;

    generate_from_ICU_xml();
}

sub generate_from_ICU_xml
{
    my @files = glob File::Spec->catfile( $opts{dir}, '*.xml' );

    print "Reading ICU files from: '$opts{dir}'...\n"
        unless $opts{quiet};

    foreach my $f (@files)
    {
        if ( $opts{file} )
        {
            next unless $opts{file} eq basename($f);
        }

        my $id = basename($f)
            or die "Could not get basename for: '$f'";
        $id =~ s/\.xml$//i;

        next if $id eq 'supplementalData';

        if ( $id =~ /^iw/ )
        {
            print "  skipping $id - iw/he Hebrew duplication, prefer 'he'\n"
                unless $opts{quiet};
            next;
        }

        print "  $id" unless $opts{quiet};

        die "Found a file matching an alias: $id\n" if $Aliases{$id};

        # This is the fastest one from my testing
        local $XML::Simple::PREFERRED_PARSER = 'XML::LibXML::SAX';

        my %key_attr = ( calendar => 'type',
                         monthContext => 'type',
                         dayContext   => 'type',
                         monthWidth => 'type',
                         dayWidth   => 'type',
                         month => 'type',
                         day   => 'type',
                         era   => 'type',
                         timeFormatLength => 'type',
                         dateFormatLength => 'type',
                       );
        my $xml =
            XMLin( $f,
                   KeyAttr   => \%key_attr,
                   GroupTags => { eras    => 'eraAbbr',
                                  eraAbbr => 'era',
                                },
                   Forcearray => [ keys %key_attr ],
                   ForceContent => 1,
                 )
                or die "Missing XML for locale: '$id'";

        delete $xml->{numbers};
        delete $xml->{posix};

        $XML{$id} = { xml         => $xml,
                      source_file => basename($f),
                    };

        build_language_and_territory($id, $xml);

        $Parents{$id} = $xml->{alias}{source}
            if $xml->{alias};

        print "\n" unless $opts{quiet};
    }

    print "\nGenerating PM files\n" if $opts{verbose};

    foreach my $id ( sort keys %XML )
    {
        my $xml = $XML{$id}{xml};

        my $cal = $xml->{dates}{calendars}{calendar}{gregorian};

        my $hash;
        $hash->{id}        = $id;
        $hash->{parent_id} = parent_id($id);

        print "\n  $id\n" if $opts{verbose};
        print "    parent_id: $hash->{parent_id}\n" if $opts{verbose};

        get_version( $hash, $xml );
        get_generation_date( $hash, $xml );
        get_days( $hash, $cal );
        get_months( $hash, $cal );
        get_formats( $hash, $cal );
        get_am_pm_eras( $hash, $cal );

        user_supplied_changes($hash);

        unless ( has_data($hash) )
        {
            my $parent = parent_id($id);
            $LocalesWithoutData{$id} = $parent;

            print "\n  $id has no data\n" unless $opts{quiet};
            next;
        }

        generate_locale($hash);
    }

    generate_name_lookup();

    print "\nAll done\n" unless $opts{quiet};
}

sub build_language_and_territory
{
    my ( $id, $xml ) = @_;

    _build_lookup
        ( \%LanguageLookup,  $id, $xml->{localeDisplayNames}{languages}{language} );

    _build_lookup
        ( \%TerritoryLookup, $id, $xml->{localeDisplayNames}{territories}{territory} );

    _build_lookup
        ( \%ScriptLookup,    $id, $xml->{localeDisplayNames}{scripts}{script} );
}

sub _build_lookup
{
    my ( $lookup, $id, $xml ) = @_;

    return unless $xml;

    my ($lang) = split /_/, $id;          # Interested in language only

    if ( ref $xml eq "HASH" )
    {
        $lookup->{$lang}{ $xml->{type} } = $xml->{content};
    }
    else
    {
        $lookup->{$lang}{ $_->{type} } = $_->{content} for grep { ! $_->{alt} } @$xml;
    }
}

sub has_data
{
    my $hash = shift;

    return keys %$hash > 2;
}

sub parent_id
{
    my $id = shift;

    return $Parents{$id} if exists $Parents{$id};

    my ($language, $territory, $variant) = split /_/, $id;

    my $parent_id;

    if ( $territory )
    {
        $parent_id  =   $language;
        $parent_id .= "_$territory" if $variant;
    }
    else
    {
        $parent_id = $language =~ /^root$/i ? 'Base' : 'root';
    }

    if ( $LocalesWithoutData{$parent_id} )
    {
        return parent_id($parent_id);
    }

    return $parent_id;
}

sub get_version
{
    my ( $hash, $xml ) = @_;

    $hash->{version} = $xml->{identity}{version}{number};

    print "    version: $hash->{version}\n" if $opts{verbose};
}

sub get_generation_date
{
    my ( $hash, $xml ) = @_;

    $hash->{generation_date} = $xml->{identity}{generation}{date};
    print "    generation date: $hash->{generation_date}\n" if $opts{verbose};
}

sub get_days
{
    my ( $hash, $cal ) = @_;

    _get_days($hash, $cal, 'wide',         'day_names'         );
    _get_days($hash, $cal, 'abbreviated',  'day_abbreviations' );
    _get_days($hash, $cal, 'narrow',       'day_narrows'       );
}

sub _get_days
{
    my ( $hash, $cal, $length, $key ) = @_;

    my $days =
        $cal->{days}{dayContext}{format}{dayWidth}{$length}{day};

    return unless $days;

    my @keys = qw( mon tue wed thu fri sat sun );

    return if grep { ! exists $days->{$_} } @keys;

    @{ $hash->{$key} } = map { $days->{$_}{content} } @keys;

    print "    $key: @{ $hash->{$key} }\n" if $opts{verbose};
}

sub get_months
{
    my ( $hash, $cal ) = @_;

    _get_months($hash, $cal, 'wide',         'month_names'         );
    _get_months($hash, $cal, 'abbreviated',  'month_abbreviations' );
    _get_months($hash, $cal, 'narrow',       'month_narrows'       );
}

sub _get_months
{
    my ($hash, $cal, $length, $key) = @_;

    my $months =
        $cal->{months}{monthContext}{format}{monthWidth}{$length}{month};

    return unless $months;
    return if grep { ! exists $months->{$_} } 1..12;

    @{ $hash->{$key} } = map { $months->{$_}{content} } 1..12;

    print "    $key: @{ $hash->{$key} }\n" if $opts{verbose};
}

sub get_formats
{
    my ( $hash, $cal ) = @_;

    if ( my $dates = $cal->{dateFormats} )
    {
        my %original;
        foreach my $length (@FormatLengths)
        {
            my $pattern = $dates->{dateFormatLength}{$length}{dateFormat}{pattern};
            next unless $pattern;

            my $value;
            if ( ref $pattern eq 'ARRAY' )
            {
                my @non_alt = grep { ! $_->{alt} } @$pattern;

                die "Multiple primary patterns!\n" if @non_alt > 1;

                $value = $non_alt[0]->{content};
            }
            else
            {
                $value = $pattern->{content};
            }

            next unless defined $value;

            $original{$length} = $value;

            $hash->{date_formats}{$length} = simple2strf($value);

            print "    date_formats{$length}: $hash->{date_formats}{$length}\n"
                if $opts{verbose};
        }

        if ( $dates->{default}{type} )
        {
            $hash->{default_date_format_length} = $dates->{default}{type};

            print "    default_date_format_length: $hash->{default_date_format_length}\n"
                if $opts{verbose};
        }

        if ( my $short = $original{short} )
        {
            # Work out the order of the date parts (ymd, dmy, or mdy)
            $short =~ tr{dmyDMY}{}cd;
            $short =~ tr{dmyDMY}{dmydmy}s;

            $hash->{date_parts_order} = $short;

            print "    date_parts_order: $hash->{date_parts_order}\n" if $opts{verbose};
        }
    }

    if ( my $times = $cal->{timeFormats} )
    {
        foreach my $length (@FormatLengths)
        {
            my $pattern = $times->{timeFormatLength}{$length}{timeFormat}{pattern};
            next unless $pattern;

            my $value;
            if ( ref $pattern eq 'ARRAY' )
            {
                my @non_alt = grep { ! $_->{alt} } @$pattern;

                die "Multiple primary patterns!\n" if @non_alt > 1;

                $value = $non_alt[0]->{content};
            }
            else
            {
                $value = $pattern->{content};
            }

            next unless defined $value;

            $hash->{time_formats}{$length} = simple2strf($value);

            print "    time_formats{$length}: $hash->{time_formats}{$length}\n"
                if $opts{verbose};
        }

        if ( $times->{default}{type} )
        {
            $hash->{default_time_format_length} = $times->{default}{type};

            print "    default_time_format_length: $hash->{default_time_format_length}\n"
                if $opts{verbose};
        }
    }

    if ( my $order =
         $cal->{dateTimeFormats}{dateTimeFormatLength}{dateTimeFormat}{pattern}{content} )
    {
        $hash->{date_before_time} = $order eq "{1} {0}" ? 1 : 0;

        print "    date_before_time: $hash->{date_before_time}\n" if $opts{verbose};
    }
}

sub simple2strf
{
    my $simple = shift;

    $simple =~
        s/(G+|y+|u+|M+|d+|h+|H+|m+|s+|S+|E+|D+|F+|w+|W+|a+|k+|K+|z+)|'((?:[^']|'')*)'/
          $2 ? _stringify($2) : $1 ? _convert($1) : "'"/eg;

    return $simple;
}

sub _convert
{
    my $simple = shift;

    for ( my $x = 0; $x < @JavaPatterns; $x += 2 )
    {
        return '%' . $JavaPatterns[ $x + 1 ] if $simple =~ /$JavaPatterns[$x]/;
    }

    die "**Dont know $simple***";
}

sub _stringify
{
    my $string = shift;

    $string =~ s/%(?:[^%])/%%/g;
    $string =~ s/\'\'/\'/g;

    return $string;
}

# used to hardcode corrections that aren't yet in ICU
sub user_supplied_changes
{
    my $hash = shift;

    if ( $hash->{id} eq 'az' )
    {
        # The az.xml file appears to have a mistake in the wide day names
        $hash->{day_names}[4] =~ s/ \w+$//;
    }
}

sub get_am_pm_eras
{
    my ( $hash, $cal ) = @_;

    if ( $cal->{am} )
    {
        $hash->{am_pms} = [ $cal->{am}{content}, $cal->{pm}{content} ];

        print "    am_pms: [@{ $hash->{am_pms} }]\n" if $opts{verbose};
    }

    if ( $cal->{eras} )
    {
        my $eras = $cal->{eras}{eraAbbr} ? $cal->{eras}{eraAbbr} : $cal->{eras};

        # At least one local (Lao) only defines one era (missing CE)
        if ( $cal->{eras}{0} && $cal->{eras}{1} )
        {
            $hash->{eras} =
                [ $eras->{0}{content},
                  $eras->{1}{content} ];

            print "    eras: [@{ $hash->{eras} }]\n" if $opts{verbose};
        }
    }
}

sub generate_locale
{
    my $hash = shift;

    my $vars = "";
    my $subs = "";

    foreach my $var
        ( qw( day_names
              day_abbreviations
              day_narrows

              month_names
              month_abbreviations
              month_narrows

              am_pms
              eras
            ) )
    {
        next unless $hash->{$var};

        $vars .= "my \@$var = (\n";

        foreach my $val ( @{ $hash->{$var} } )
        {
            $vars .= qq|"\Q$val\E",\n|;
        }

        $vars .= ");\n\n";

        $subs .= sprintf "sub %-30s { \\\@%s }\n", $var, $var;
    }

    foreach my $var ( qw( date_formats
                          time_formats
                        ) )
    {
        next unless $hash->{$var};

        for my $length (@FormatLengths)
        {
            next unless $hash->{$var}{$length};

            my $sub_name = join '_', $length, $var;
            $sub_name =~ s/s$//;

            $subs .=
                sprintf "sub %-30s { %s }\n", $sub_name, qq|"\Q$hash->{$var}{$length}\E"|;
        }
    }

    # The accessor subs and whether they should be public (1) or
    # private (0).  Private subs are prefixed with _.
    my %accessors =
	( date_before_time           => 1,
	  date_parts_order           => 1,
	  default_date_format_length => 0,
	  default_time_format_length => 0,
	  );

    while ( my ( $var, $public ) = each %accessors )
    {
        next unless $hash->{$var};

        $vars .= qq|my \$$var = "\Q$hash->{$var}\E";\n|;

        my $sub_name = $public ? $var : "_$var";

        $subs .= sprintf "sub %-30s { \$%s }\n", $sub_name, $var;
    }

    my $class         = $hash->{id};
    my $parent        = $hash->{parent_id};

    my $id = $hash->{id};

    my $file = File::Spec->catfile( 'lib', 'DateTime', 'Locale', "$class.pm");

    local *OUTFILE;

    if ( $] >= 5.008 )
    {
        open OUTFILE, ">:utf8", $file or die $!;
    }
    else
    {
        open OUTFILE, ">$file" or die $!;
    }

    print OUTFILE <<"EOF" or die "print failed: $!";
###########################################################################
#
# This file is auto-generated by the Perl DateTime Suite time locale
# generator ($VERSION).  This code generator comes with the
# DateTime::Locale distribution in the tools/ directory, and is called
# $ScriptName.
#
# This file as generated from the ICU XML locale data.  See the
# LICENSE.icu file included in this distribution for license details.
#
# This file was generated from the source file $XML{$id}{source_file}.
# The source file version number was $hash->{version}, generated on
# $hash->{generation_date}.
#
# Do not edit this file directly.
#
###########################################################################

package DateTime::Locale::$class;

use strict;

BEGIN
{
    if ( \$] >= 5.006 )
    {
        require utf8; utf8->import;
    }
}

use DateTime::Locale::$parent;

\@DateTime::Locale::${class}::ISA = qw(DateTime::Locale::$parent);

$vars

$subs


1;

EOF

    close OUTFILE or warn $!;

    print MAN "$file\n";
}

sub generate_name_lookup
{
    local *OUTFILE;
    if ( $] >= 5.008 )
    {
        open OUTFILE, '>:utf8', File::Spec->catfile( 'lib', 'DateTime', 'LocaleCatalog.pm' )
            or die "$!";
    }
    else
    {
        open OUTFILE, '>' . File::Spec->catfile( 'lib', 'DateTime', 'LocaleCatalog.pm' )
            or die "$!";
    }

    print OUTFILE <<'EOF';
package DateTime::LocaleCatalog;

use strict;

BEGIN
{
    return unless $] >= 5.006;

    require utf8; import utf8;
}

EOF

    my $locales_in_pod = '';

    print OUTFILE "\@DateTime::Locale::Locales = (\n";

    my $locale_with_full_names =
        $LanguageLookup{root}{en} && length $LanguageLookup{root}{en} > 3 ? 'root' : 'en';

    for my $id ( sort keys %XML )
    {
        my ($lang, $script, $territory, $variant ) =
            $id =~ /([a-z]+)               # id
                    (?: _([A-Z][a-z]+) )?  # script - Title Case - optional
                    (?: _([A-Z]+) )?       # territory - ALL CAPS - optional
                    (?: _([A-Z]+) )?       # variant - ALL CAPS - optional
                   /x;

        my %lookup;

        $lookup{en_language} = $LanguageLookup{$locale_with_full_names}{$lang};
        $lookup{native_language} =
            $LanguageLookup{$lang}{$lang} || $LanguageLookup{$locale_with_full_names}{$lang};

        if ($script)
        {
            $lookup{en_script} = $ScriptLookup{$locale_with_full_names}{$script};

            $lookup{native_script} =
                ( exists $ScriptLookup{$lang}{$script}
                  ? $ScriptLookup{$lang}{$script}
                  : $ScriptLookup{$locale_with_full_names}{$script} );
        }

        if ($territory)
        {
            $lookup{en_territory} = $TerritoryLookup{$locale_with_full_names}{$territory};

            $lookup{native_territory} =
                ( exists $TerritoryLookup{$lang}{$territory}
                  ? $TerritoryLookup{$lang}{$territory}
                  : $TerritoryLookup{$locale_with_full_names}{$territory} );
        }

        if ($variant)
        {
            $lookup{en_variant} = ucfirst lc $variant;
        }
        else
        {
            $lookup{en_variant} = undef;
        }

        $lookup{native_variant} = $lookup{en_variant};

        print OUTFILE qq|    { id => "\Q$id\E",\n|;
        foreach my $k ( qw( en_language en_script en_territory en_variant
                            native_language native_script native_territory native_variant
                          )
                      )
        {
            next unless defined $lookup{$k};

            my $val = qq|"\Q$lookup{$k}\E"|;
            print OUTFILE "      $k => $val,\n";
        }

        if ( $LocalesWithoutData{$id} )
        {
            print OUTFILE qq|      real_class => "\Q$LocalesWithoutData{$id}\E",\n|;
        }

        print OUTFILE "    },\n";

        my @pieces;
        foreach my $p ( qw( en_language en_territory en_variant ) )
        {
            push @pieces, $lookup{$p} if defined $lookup{$p};
        }

        push @pieces, "($lookup{en_script})" if defined $lookup{en_script};

        $locales_in_pod .= sprintf( " %-18s  %s\n", $id, join ' ', @pieces );
    }

    print OUTFILE ");\n\n";

    my $aliases_in_pod = '';

    print OUTFILE "%DateTime::Locale::Aliases = (\n";
    foreach my $id ( sort keys %Aliases )
    {
        print OUTFILE "    $id => '$Aliases{$id}',\n";

        $aliases_in_pod .= sprintf( " %-18s  %s\n", $id, $Aliases{$id} );
    }
    print OUTFILE ");";

    print OUTFILE <<"EOF";


1;

__END__

=head1 NAME

DateTime::LocaleCatalog - Provides a list of all valid locale names

=head1 SYNOPSIS

See DateTime::Locale for usage details.

=head1 DESCRIPTION

This module contains a list of all known locales.

=head1 LOCALES

Any method taking locale id or name arguments should use one of the
values listed below.  Ids and names are case sensitive.

Always select the closest matching locale - for example, French
Canadians would choose fr_CA over fr - and B<always> use locale ids in
preference to names; locale ids offer greater compatibility when using
localized third party modules.

Many of the available locales are the same as other, more generic
locales for datetime information.  In that case, we simply load the
more generic class.  However, the various methods related to name,
territory, and variant return the values for the requested locale.

The available locales are:

 Locale id           Locale name
 ==================================================
$locales_in_pod

There are also some hard-coded aliases available, these are:

 Locale id           Is an alias for
 ==================================================
$aliases_in_pod

=cut
EOF

    close OUTFILE or warn $!;
}

sub usage
{
    print <<'EOF';

This script parses the ICU locale files and turns them into a set of
Perl modules.  It also generates the MANIFEST file.

It takes the following arguments:

  --dir      A directory containing ICU XML files.  Required.

  --file     Parse just the file with the given name.  For debugging.

  --name     Only create the specified locale.  For debugging.

  --quiet    Don't display any output while processing files.

  --verbose  Spew lots of output while processing.

  --help     What you are reading

If the --file or --name options are specified, the MANIFEST will not
be generated.

EOF

    exit;
}