UCD.pm   [plain text]


# $Id: UCD.pm,v 1.1 2003/06/04 00:27:53 marka Exp $
#
# Copyright (c) 2000,2001 Japan Network Information Center.
# All rights reserved.
#  
# By using this file, you agree to the terms and conditions set forth bellow.
# 
# 			LICENSE TERMS AND CONDITIONS 
# 
# The following License Terms and Conditions apply, unless a different
# license is obtained from Japan Network Information Center ("JPNIC"),
# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
# Chiyoda-ku, Tokyo 101-0047, Japan.
# 
# 1. Use, Modification and Redistribution (including distribution of any
#    modified or derived work) in source and/or binary forms is permitted
#    under this License Terms and Conditions.
# 
# 2. Redistribution of source code must retain the copyright notices as they
#    appear in each source code file, this License Terms and Conditions.
# 
# 3. Redistribution in binary form must reproduce the Copyright Notice,
#    this License Terms and Conditions, in the documentation and/or other
#    materials provided with the distribution.  For the purposes of binary
#    distribution the "Copyright Notice" refers to the following language:
#    "Copyright (c) 2000-2002 Japan Network Information Center.  All rights reserved."
# 
# 4. The name of JPNIC may not be used to endorse or promote products
#    derived from this Software without specific prior written approval of
#    JPNIC.
# 
# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
#    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
#    PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL JPNIC BE LIABLE
#    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
#    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
#    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
#    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
#    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
#

package UCD;

#
# UCD.pm -- parser for Unicode Character Database files.
#
# This file is an aggregation of the following modules, each of which
# provides a parser for a specific data file of UCD.
#	UCD::UnicodeData		-- for UnicodeData.txt
#	UCD::CaseFolding		-- for CaseFolding.txt
#	UCD::SpecialCasing		-- for SpecialCasing.txt
#	UCD::CompositionExclusions	-- for CompositionExclusions-1.txt
#
# Each module provides two subroutines:
#
#   $line = getline(\*HANDLE);
#	reads next non-comment line from HANDLE, and returns it.
#	undef will be returned upon EOF.
#
#   %fields = parse($line);
#	parses a line and extract fields, and returns a list of
#	field name and its value, suitable for assignment to a hash.
#

package UCD::UnicodeData;

use strict;
use Carp;

sub getline {
    my $fh = shift;
    my $s = <$fh>;
    $s =~ s/\r?\n$// if $s;
    $s;
}

sub parseline {
    my $s = shift;

    my @f = split /;/, $s, -1;
    return (CODE     => hex($f[0]),
	    NAME     => $f[1],
	    CATEGORY => $f[2],
	    CLASS    => $f[3]+0,
	    BIDI     => $f[4],
	    DECOMP   => dcmap($f[5]),
	    DECIMAL  => dvalue($f[6]),
	    DIGIT    => dvalue($f[7]),
	    NUMERIC  => dvalue($f[8]),
	    MIRRORED => $f[9] eq 'Y',
	    NAME10   => $f[10],
	    COMMENT  => $f[11],
	    UPPER    => ucode($f[12]),
	    LOWER    => ucode($f[13]),
	    TITLE    => ucode($f[14]));
}

sub dcmap {
    my $v = shift;
    return undef if $v eq '';
    $v =~ /^(?:(<[^>]+>)\s*)?(\S.*)/
	or croak "invalid decomposition mapping \"$v\"";
    my $tag = $1 || '';
    [$tag, map {hex($_)} split(' ', $2)];
}

sub ucode {
    my $v = shift;
    return undef if $v eq '';
    hex($v);
}

sub dvalue {
    my $v = shift;
    return undef if $v eq '';
    $v;
}

package UCD::CaseFolding;

use strict;

sub getline {
    my $fh = shift;
    while (defined(my $s = <$fh>)) {
	next if $s =~ /^\#/;
	next if $s =~ /^\s*$/;
	$s =~ s/\r?\n$//;
	return $s;
    }
    undef;
}

sub parseline {
    my $s = shift;
    my @f = split /;\s*/, $s, -1;
    return (CODE => hex($f[0]),
	    TYPE => $f[1],
	    MAP  => [map(hex, split ' ', $f[2])],
	   );
}

package UCD::SpecialCasing;

use strict;

sub getline {
    my $fh = shift;
    while (defined(my $s = <$fh>)) {
	next if $s =~ /^\#/;
	next if $s =~ /^\s*$/;
	$s =~ s/\r?\n$//;
	return $s;
    }
    undef;
}

sub parseline {
    my $s = shift;

    my @f = split /;\s*/, $s, -1;
    my $cond = (@f > 5) ? $f[4] : undef;
    return (CODE => hex($f[0]),
	    LOWER => [map(hex, split ' ', $f[1])],
	    TITLE => [map(hex, split ' ', $f[2])],
	    UPPER => [map(hex, split ' ', $f[3])],
	    CONDITION => $cond);
}

package UCD::CompositionExclusions;

use strict;

sub getline {
    my $fh = shift;
    while (defined(my $s = <$fh>)) {
	next if $s =~ /^\#/;
	next if $s =~ /^\s*$/;
	$s =~ s/\r?\n$//;
	return $s;
    }
    undef;
}

sub parseline {
    my $s = shift;
    m/^[0-9A-Fa-f]+/;
    return (CODE => hex($&));
}

1;