LDML.pm   [plain text]


package    # don't want this indexed by PAUSE
    LDML;

use strict;
use warnings;
use utf8;

use Data::Dumper;
use Lingua::EN::Inflect qw( PL_N );
use List::Util qw( first );
use Path::Class;
use Storable qw( nstore_fd fd_retrieve );
use XML::LibXML;

use Moose;
use Moose::Util::TypeConstraints;
use MooseX::ClassAttribute;

has 'id' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

has 'source_file' => (
    is       => 'ro',
    isa      => 'Path::Class::File',
    required => 1,
);

has 'document' => (
    is       => 'ro',
    isa      => 'XML::LibXML::Document',
    required => 1,
    clearer  => '_clear_document',
);

class_has 'Aliases' => (
    is      => 'ro',
    isa     => 'HashRef',
    lazy    => 1,
    default => sub {
        return {
            'C'     => 'en_US_POSIX',
            'POSIX' => 'en_US_POSIX',

            # Apparently the Hebrew locale code was changed from iw to he at
            # one point.
            'iw'    => 'he',
            'iw_IL' => 'he_IL',

            # CLDR got rid of no
            'no'       => 'nn',
            'no_NO'    => 'nn_NO',
            'no_NO_NY' => 'nn_NO',
        };
    },
);

class_has 'FormatLengths' => (
    is      => 'ro',
    isa     => 'ArrayRef',
    lazy    => 1,
    default => sub { return [qw( full long medium short )] },
);

has 'version' => (
    is         => 'ro',
    isa        => 'Str',
    lazy_build => 1,
);

has 'generation_date' => (
    is         => 'ro',
    isa        => 'Str',
    lazy_build => 1,
);

has 'language' => (
    is      => 'ro',
    isa     => 'Str',
    lazy    => 1,
    default => sub { ( $_[0]->_parse_id() )[0] },
);

has 'script' => (
    is      => 'ro',
    isa     => 'Str|Undef',
    lazy    => 1,
    default => sub { ( $_[0]->_parse_id() )[1] },
);

has 'territory' => (
    is      => 'ro',
    isa     => 'Str|Undef',
    lazy    => 1,
    default => sub { ( $_[0]->_parse_id() )[2] },
);

has 'variant' => (
    is      => 'ro',
    isa     => 'Str|Undef',
    lazy    => 1,
    default => sub { ( $_[0]->_parse_id() )[3] },
);

has 'parent_id' => (
    is         => 'ro',
    isa        => 'Str',
    lazy_build => 1,
);

class_type 'XML::LibXML::Node';
has '_calendar_node' => (
    is      => 'ro',
    isa     => 'XML::LibXML::Node|Undef',
    lazy    => 1,
    default => sub {
        $_[0]->_find_one_node(q{dates/calendars/calendar[@type='gregorian']});
    },
);

has 'has_calendar_data' => (
    is      => 'ro',
    isa     => 'Bool',
    lazy    => 1,
    default => sub { $_[0]->_calendar_node() ? 1 : 0 },
);

for my $thing (
    {
        name   => 'day',
        length => 7,
        order  => [qw( mon tue wed thu fri sat sun )],
    }, {
        name   => 'month',
        length => 12,
        order  => [ 1 .. 12 ],
    }, {
        name   => 'quarter',
        length => 4,
        order  => [ 1 .. 4 ],
    },
    ) {
    for my $context (qw( format stand_alone )) {
        for my $size (qw( wide abbreviated narrow )) {
            my $name = $thing->{name};

            my $attr = $name . q{_} . $context . q{_} . $size;
            has $attr => (
                is         => 'ro',
                isa        => 'ArrayRef',
                lazy_build => 1,
            );

            my $required_length = $thing->{length};

            ( my $xml_context = $context ) =~ s/_/-/g;
            my $path = (
                join '/',
                PL_N($name),
                $name . 'Context' . q{[@type='} . $xml_context . q{']},
                $name . 'Width' . q{[@type='} . $size . q{']},
                $name
            );

            my $builder = sub {
                my $self = shift;

                return [] unless $self->has_calendar_data();

                my @vals = $self->_find_preferred_values(
                    ( scalar $self->_calendar_node()->findnodes($path) ),
                    'type',
                    $thing->{order},
                );

                return [] unless @vals == $thing->{length};

                return \@vals;
            };

            __PACKAGE__->meta()->add_method( '_build_' . $attr => $builder );
        }
    }
}

# eras have a different name scheme for sizes than other data
# elements, go figure.
for my $size ( [ wide => 'Names' ], [ abbreviated => 'Abbr' ],
    [ narrow => 'Narrow' ] ) {
    my $attr = 'era_' . $size->[0];

    has $attr => (
        is         => 'ro',
        isa        => 'ArrayRef',
        lazy_build => 1,
    );

    my $path = (
        join '/',
        'eras',
        'era' . $size->[1],
        'era',
    );

    my $builder = sub {
        my $self = shift;

        return [] unless $self->has_calendar_data();

        my @vals = $self->_find_preferred_values(
            ( scalar $self->_calendar_node()->findnodes($path) ),
            'type',
            [ 0, 1 ],
        );

        return [] unless @vals == 2;

        return \@vals;
    };

    __PACKAGE__->meta()->add_method( '_build_' . $attr => $builder );
}

for my $type (qw( date time )) {
    for my $length (qw( full long medium short )) {
        my $attr = $type . q{_format_} . $length;

        has $attr => (
            is         => 'ro',
            isa        => 'Str|Undef',
            lazy_build => 1,
        );

        my $path = (
            join '/',
            $type . 'Formats',
            $type . q{FormatLength[@type='} . $length . q{']},
            $type . 'Format',
            'pattern',
        );

        my $builder = sub {
            my $self = shift;

            return unless $self->has_calendar_data();

            return $self->_find_one_node_text( $path,
                $self->_calendar_node() );
        };

        __PACKAGE__->meta()->add_method( '_build_' . $attr => $builder );
    }
}

has 'default_date_format_length' => (
    is      => 'ro',
    isa     => 'Str|Undef',
    lazy    => 1,
    default => sub {
        $_[0]->_find_one_node_attribute(
            'dateFormats/default',
            $_[0]->_calendar_node(),
            'choice'
        );
    },
);

has 'default_time_format_length' => (
    is      => 'ro',
    isa     => 'Str|Undef',
    lazy    => 1,
    default => sub {
        $_[0]->_find_one_node_attribute(
            'timeFormats/default',
            $_[0]->_calendar_node(),
            'choice'
        );
    },
);

has 'am_pm_abbreviated' => (
    is         => 'ro',
    isa        => 'ArrayRef',
    lazy_build => 1,
);

has 'datetime_format' => (
    is         => 'ro',
    isa        => 'Str|Undef',
    lazy_build => 1,
);

has 'available_formats' => (
    is         => 'ro',
    isa        => 'HashRef[Str]',
    lazy_build => 1,
);

# This is really only built once for all objects
has '_first_day_of_week_index' => (
    is         => 'ro',
    isa        => 'HashRef',
    lazy_build => 1,
);

has 'first_day_of_week' => (
    is         => 'ro',
    isa        => 'Int',
    lazy_build => 1,
);

for my $thing (qw( language script territory variant )) {
    {
        my $en_attr = q{en_} . $thing;

        has $en_attr => (
            is         => 'ro',
            isa        => 'Str|Undef',
            lazy_build => 1,
        );

        my $en_ldml;
        my $builder = sub {
            my $self = shift;

            my $val_from_id = $self->$thing();
            return unless defined $val_from_id;

            $en_ldml
                ||= ( ref $self )
                ->new_from_file(
                $self->source_file()->dir()->file('en.xml') );

            my $path
                = 'localeDisplayNames/'
                . PL_N($thing) . q{/}
                . $thing
                . q{[@type='}
                . $self->$thing() . q{']};

            return $en_ldml->_find_one_node_text($path);
        };

        __PACKAGE__->meta()->add_method( '_build_' . $en_attr => $builder );
    }

    {
        my $native_attr = q{native_} . $thing;

        has $native_attr => (
            is         => 'ro',
            isa        => 'Str|Undef',
            lazy_build => 1,
        );

        my $builder = sub {
            my $self = shift;

            my $val_from_id = $self->$thing();
            return unless defined $val_from_id;

            my $path
                = 'localeDisplayNames/'
                . PL_N($thing) . q{/}
                . $thing
                . q{[@type='}
                . $self->$thing() . q{']};

            for ( my $ldml = $self; $ldml; $ldml = $ldml->_load_parent() ) {
                my $native_val = $ldml->_find_one_node_text($path);

                return $native_val if defined $native_val;
            }

            return;
        };

        __PACKAGE__->meta()
            ->add_method( '_build_' . $native_attr => $builder );
    }
}

sub _load_parent {
    my $self = shift;

    my $parent_id = $self->parent_id();
    return unless defined $parent_id;

    my $file = $self->source_file()->dir()->file( $parent_id . '.xml' );

    return unless -f $file;

    return ( ref $self )->new_from_file($file);
}

{
    my %Cache;

    sub new_from_file {
        my $class = shift;
        my $file  = file(shift);

        my $id = $file->basename();
        $id =~ s/\.xml$//i;

        return $Cache{$id}
            if $Cache{$id};

        my $doc = $class->_resolve_document_aliases($file);

        return $Cache{$id} = $class->new(
            id          => $id,
            source_file => $file,
            document    => $doc,
        );
    }
}

{
    my $Parser = XML::LibXML->new();
    $Parser->load_catalog('/etc/xml/catalog.xml');
    $Parser->load_ext_dtd(0);

    sub _resolve_document_aliases {
        my $class = shift;
        my $file  = shift;

        my $doc = $Parser->parse_file( $file->stringify() );

        $class->_resolve_aliases_in_node( $doc->documentElement(), $file );

        return $doc;
    }
}

sub _resolve_aliases_in_node {
    my $class = shift;
    my $node  = shift;
    my $file  = shift;

ALIAS:
    for my $node ( $node->getElementsByTagName('alias') ) {

        # Replacing all the aliases is slow, and we really don't care
        # about most of the data in the file, just the
        # localeDisplayNames and the gregorian calendar.
        #
        # We also end up skipping the case where the entire locale is an alias to some
        # other locale. This is handled in the generated Perl code.
        for ( my $p = $node->parentNode(); $p; $p = $p->parentNode() ) {
            if ( $p->nodeName() eq 'calendar' ) {
                if ( $p->getAttribute('type') eq 'gregorian' ) {
                    last;
                }
                else {
                    next ALIAS;
                }
            }

            last if $p->nodeName() eq 'localeDisplayNames';

            next ALIAS if $p->nodeName() eq 'ldml';
            next ALIAS if $p->nodeName() eq '#document';
        }

        $class->_resolve_alias( $node, $file );
    }
}

sub _resolve_alias {
    my $class = shift;
    my $node  = shift;
    my $file  = shift;

    my $source = $node->getAttribute('source')
        or die "Alias with no source in $file";

    if ( $source eq 'locale' ) {
        $class->_resolve_local_alias( $node, $file );
    }
    else {
        $class->_resolve_remote_alias( $node, $file );
    }
}

sub _resolve_local_alias {
    my $class = shift;
    my $node  = shift;
    my $file  = shift;

    my $path = $node->getAttribute('path');

    # The path resolves from the context of the parent node, not the
    # current node. Why? Why not?
    $class->_replace_alias_with_path( $node, $path, $node->parentNode(),
        $file );
}

sub _resolve_remote_alias {
    my $class = shift;
    my $node  = shift;
    my $file  = shift;

    my $source      = $node->getAttribute('source');
    my $target_file = $file->dir()->file( $source . q{.xml} );

    my $doc = $class->_resolve_document_aliases($target_file);

    # I'm not sure nodePath() will work, since it seems to return an
    # array-based index like /ldml/dates/calendars/calendar[4]. I'm
    # not sure if LDML allows this, but the target file might contain
    # a different ordering or may just be missing something. This
    # whole alias thing is madness.
    #
    # However, remote aliases seem to be a rare case outside of an
    # alias for the entire file, so they can be investigated as
    # needed.
    my $path = $node->getAttribute('path') || $node->parentNode()->nodePath();

    $class->_replace_alias_with_path( $node, $path, $doc, $target_file );
}

sub _replace_alias_with_path {
    my $class   = shift;
    my $node    = shift;
    my $path    = shift;
    my $context = shift;
    my $file    = shift;

    my @targets = $context->findnodes($path);

    my $line = $node->line_number();
    die "Path ($path) resolves to multiple nodes in $file (line $line)"
        if @targets > 1;

    die "Path ($path) does not resolve to any node in $file (line $line)"
        if @targets == 0;

    my $parent = $node->parentNode();

    $parent->removeChildNodes();
    $parent->appendChild( $_->cloneNode(1) ) for $targets[0]->childNodes();

    # This means the same things get resolved multiple times, but it's
    # pretty fast with LibXML, and simpler to code than something more
    # efficient.
    $class->_resolve_aliases_in_node( $parent, $file );
}

sub BUILD {
    my $self = shift;

    my $meth = q{_} . $self->id() . q{_hack};

    # This gives us a chance to apply bug fixes to the data as needed.
    $self->$meth()
        if $self->can($meth);

    return $self;
}

sub _az_hack {
    my $self = shift;
    my $data = shift;

    # The az.xml file appears to have a mistake in the wide day names,
    # thursday and friday are the same for this locale

    my $thu = $self->_find_one_node_text(
        q{days/dayContext[@type='format']/dayWidth[@type='wide']/day[@type='thu']},
        $self->_calendar_node()
    );

    my $fri = $self->_find_one_node(
        q{days/dayContext[@type='format']/dayWidth[@type='wide']/day[@type='fri']},
        $self->_calendar_node()
    );

    $fri->removeChildNodes();

    $thu =~ s/ \w+$//;
    $fri->appendChild( $self->document()->createTextNode($thu) );
}

sub _gaa_hack {
    my $self = shift;
    my $data = shift;

    my $path
        = q{days/dayContext[@type='format']/dayWidth[@type='abbreviated']/day[@type='sun']};

    my $day_text
        = $self->_find_one_node_text( $path, $self->_calendar_node() );

    return unless $day_text eq 'Ho';

    # I am completely making this up, but the data is marked as
    # unconfirmed in the locale file and making something up is
    # preferable to having two days with the same abbreviation

    my $day = $self->_find_one_node( $path, $self->_calendar_node() );

    $day->removeChildNodes();
    $day->appendChild( $self->document()->createTextNode('Hog') );
}

sub _ve_hack {
    my $self = shift;
    my $data = shift;

    my $path
        = q{months/monthContext[@type='format']/monthWidth[@type='abbreviated']/month[@type='3']};

    my $day_text
        = $self->_find_one_node_text( $path, $self->_calendar_node() );

    return unless $day_text eq 'Ṱha';

    # Again, making stuff up to avoid non-unique abbreviations

    my $day = $self->_find_one_node( $path, $self->_calendar_node() );

    $day->removeChildNodes();
    $day->appendChild( $self->document()->createTextNode('Ṱhf') );
}

sub _build_version {
    my $self = shift;

    my $version
        = $self->_find_one_node_attribute( 'identity/version', 'number' );
    $version =~ s/^\$Revision:\s+//;
    $version =~ s/\s+\$$//;

    return $version;
}

sub _build_generation_date {
    my $self = shift;

    my $date
        = $self->_find_one_node_attribute( 'identity/generation', 'date' );
    $date =~ s/^\$Date:\s+//;
    $date =~ s/\s+\$$//;

    return $date;
}

sub _parse_id {
    my $self = shift;

    return $self->id() =~ /([a-z]+)               # language
                        (?: _([A-Z][a-z]+) )?  # script - Title Case - optional
                        (?: _([A-Z]+) )?       # territory - ALL CAPS - optional
                        (?: _([A-Z]+) )?       # variant - ALL CAPS - optional
                       /x;
}

sub _build_parent_id {
    my $self = shift;

    my $source = $self->_find_one_node_attribute( 'alias', 'source' );
    return $source if defined $source;

    my @parts = (
        grep {defined} $self->language(),
        $self->script(),
        $self->territory(),
        $self->variant(),
    );

    pop @parts;

    if (@parts) {
        return join '_', @parts;
    }
    else {
        return $self->id() eq 'root' ? 'Base' : 'root';
    }
}

sub _build_am_pm_abbreviated {
    my $self = shift;

    my $am = $self->_find_one_node_text( 'am', $self->_calendar_node() );
    my $pm = $self->_find_one_node_text( 'pm', $self->_calendar_node() );

    return [] unless defined $am && defined $pm;

    return [ $am, $pm ];
}

sub _build_datetime_format {
    my $self = shift;

    return $self->_find_one_node_text(
        'dateTimeFormats/dateTimeFormatLength/dateTimeFormat/pattern',
        $self->_calendar_node()
    );
}

sub _build_available_formats {
    my $self = shift;

    return {} unless $self->has_calendar_data();

    my @nodes = $self->_calendar_node()
        ->findnodes('dateTimeFormats/availableFormats/dateFormatItem');

    my %index;
    for my $node (@nodes) {
        push @{ $index{ $node->getAttribute('id') } }, $node;
    }

    my %formats;
    for my $id ( keys %index ) {
        my $preferred = $self->_find_preferred_node( @{ $index{$id} } )
            or next;

        $formats{$id} = join '', map { $_->data() } $preferred->childNodes();
    }

    return \%formats;
}

sub _build_first_day_of_week {
    my $self = shift;

    my $terr = $self->territory();
    return 1 unless defined $terr;

    my $index = $self->_first_day_of_week_index();

    return $index->{$terr} || 1;
}

sub _find_preferred_values {
    my $self  = shift;
    my $nodes = shift;
    my $attr  = shift;
    my $order = shift;

    my @nodes = $nodes->get_nodelist();

    return [] unless @nodes;

    my %index;

    for my $node (@nodes) {
        push @{ $index{ $node->getAttribute($attr) } }, $node;
    }

    my @preferred;
    for my $attr ( @{$order} ) {

        # There may be nothing in the index for incomplete sets (of
        # days, months, etc)
        my @matches = @{ $index{$attr} || [] };

        my $preferred = $self->_find_preferred_node(@matches)
            or next;

        push @preferred, join '', map { $_->data() } $preferred->childNodes();
    }

    return @preferred;
}

sub _find_preferred_node {
    my $self  = shift;
    my @nodes = @_;

    return unless @nodes;

    return $nodes[0] if @nodes == 1;

    my $non_draft = first { !$_->getAttribute('draft') } @nodes;

    return $non_draft if $non_draft;

    return $nodes[0];
}

sub _find_one_node_text {
    my $self = shift;

    my $node = $self->_find_one_node(@_);

    return unless $node;

    return join '', map { $_->data() } $node->childNodes();
}

sub _find_one_node_attribute {
    my $self = shift;

    # attr name will always be last
    my $attr = pop;

    my $node = $self->_find_one_node(@_);

    return unless $node;

    return $node->getAttribute($attr);
}

sub _find_one_node {
    my $self    = shift;
    my $path    = shift;
    my $context = shift || $self->document()->documentElement();

    my @nodes = $self->_find_preferred_node( $context->findnodes($path) );

    if ( @nodes > 1 ) {
        my $context_path = $context->nodePath();

        die "Found multiple nodes for $path under $context_path";
    }

    return $nodes[0];
}

{
    my %days = do {
        my $x = 1;
        map { $_ => $x++ } qw( mon tue wed thu fri sat sun );
    };

    my %index;

    my $file_name = 'supplementalData.xml';

    sub _build__first_day_of_week_index {
        return \%index
            if keys %index;

        my $self = shift;

        my $file;
        for my $dir (
            $self->source_file()->dir(),
            $self->source_file()->dir()->parent()->subdir('supplemental'),
            ) {
            $file = $dir->file($file_name);

            last if -f $file;
        }

        die "Cannot find $file_name"
            unless -f $file;

        my $doc = XML::LibXML->new()->parse_file( $file->stringify() );

        my @nodes = $doc->findnodes('supplementalData/weekData/firstDay');

        for my $node (@nodes) {
            my $day_num = $days{ $node->getAttribute('day') };

            $index{$_} = $day_num
                for split /\s+/, $node->getAttribute('territories');
        }

        return \%index;
    }
}

__PACKAGE__->meta()->make_immutable();
no Moose;
no Moose::Util::TypeConstraints;

1;