intltool-extract.in [plain text]

#!@INTLTOOL_PERL@ -w 
# -*- Mode: perl; indent-tabs-mode: nil; c-basic-offset: 4  -*-

#
#  The Intltool Message Extractor
#
#  Copyright (C) 2000-2001, 2003 Free Software Foundation.
#
#  Intltool is free software; you can redistribute it and/or
#  modify it under the terms of the GNU General Public License as
#  published by the Free Software Foundation; either version 2 of the
#  License, or (at your option) any later version.
#
#  Intltool is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
#  As a special exception to the GNU General Public License, if you
#  distribute this file as part of a program that contains a
#  configuration script generated by Autoconf, you may include it under
#  the same distribution terms that you use for the rest of that program.
#
#  Authors: Kenneth Christiansen <kenneth@gnu.org>
#           Darin Adler <darin@bentspoon.com>
#

## Release information
my $PROGRAM      = "intltool-extract";
my $PACKAGE      = "intltool";
my $VERSION      = "0.33";

## Loaded modules
use strict; 
use File::Basename;
use Getopt::Long;

## Scalars used by the option stuff
my $TYPE_ARG	= "0";
my $LOCAL_ARG	= "0";
my $HELP_ARG 	= "0";
my $VERSION_ARG = "0";
my $UPDATE_ARG  = "0";
my $QUIET_ARG   = "0";
my $SRCDIR_ARG	= ".";

my $FILE;
my $OUTFILE;

my $gettext_type = "";
my $input;
my %messages = ();
my %loc = ();
my %count = ();
my %comments = ();
my $strcount = 0;

## Use this instead of \w for XML files to handle more possible characters.
my $w = "[-A-Za-z0-9._:]";

## Always print first
$| = 1;

## Handle options
GetOptions (
	    "type=s"     => \$TYPE_ARG,
            "local|l"    => \$LOCAL_ARG,
            "help|h"     => \$HELP_ARG,
            "version|v"  => \$VERSION_ARG,
            "update"     => \$UPDATE_ARG,
	    "quiet|q"    => \$QUIET_ARG,
	    "srcdir=s"	 => \$SRCDIR_ARG,
            ) or &error;

&split_on_argument;


## Check for options. 
## This section will check for the different options.

sub split_on_argument {

    if ($VERSION_ARG) {
        &version;

    } elsif ($HELP_ARG) {
	&help;
        
    } elsif ($LOCAL_ARG) {
        &place_local;
        &extract;

    } elsif ($UPDATE_ARG) {
	&place_normal;
	&extract;

    } elsif (@ARGV > 0) {
	&place_normal;
	&message;
	&extract;

    } else {
	&help;

    }  
}    

sub place_normal {
    $FILE	 = $ARGV[0];
    $OUTFILE     = "$FILE.h";
}   

sub place_local {
    $FILE	 = $ARGV[0];
    $OUTFILE     = fileparse($FILE, ());
    if (!-e "tmp/") { 
        system("mkdir tmp/"); 
    }
    $OUTFILE     = "./tmp/$OUTFILE.h"
}

sub determine_type {
   if ($TYPE_ARG =~ /^gettext\/(.*)/) {
	$gettext_type=$1
   }
}

## Sub for printing release information
sub version{
    print <<_EOF_;
${PROGRAM} (${PACKAGE}) $VERSION
Copyright (C) 2000, 2003 Free Software Foundation, Inc.
Written by Kenneth Christiansen, 2000.

This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
_EOF_
    exit;
}

## Sub for printing usage information
sub help {
    print <<_EOF_;
Usage: ${PROGRAM} [OPTION]... [FILENAME]
Generates a header file from an XML source file.

It grabs all strings between <_translatable_node> and its end tag in
XML files. Read manpage (man ${PROGRAM}) for more info.

      --type=TYPE   Specify the file type of FILENAME. Currently supports:
                    "gettext/glade", "gettext/ini", "gettext/keys"
                    "gettext/rfc822deb", "gettext/schemas",
                    "gettext/scheme", "gettext/xml"
  -l, --local       Writes output into current working directory
                    (conflicts with --update)
      --update      Writes output into the same directory the source file 
                    reside (conflicts with --local)
      --srcdir      Root of the source tree
  -v, --version     Output version information and exit
  -h, --help        Display this help and exit
  -q, --quiet       Quiet mode

Report bugs to http://bugzilla.gnome.org/ (product name "$PACKAGE")
or send email to <xml-i18n-tools\@gnome.org>.
_EOF_
    exit;
}

## Sub for printing error messages
sub error{
    print STDERR "Try `${PROGRAM} --help' for more information.\n";
    exit;
}

sub message {
    print "Generating C format header file for translation.\n" unless $QUIET_ARG;
}

sub extract {
    &determine_type;

    &convert;

    open OUT, ">$OUTFILE";
    &msg_write;
    close OUT;

    print "Wrote $OUTFILE\n" unless $QUIET_ARG;
}

sub convert {

    ## Reading the file
    {
	local (*IN);
	local $/; #slurp mode
	open (IN, "<$SRCDIR_ARG/$FILE") || die "can't open $SRCDIR_ARG/$FILE: $!";
	$input = <IN>;
    }

    &type_ini if $gettext_type eq "ini";
    &type_keys if $gettext_type eq "keys";
    &type_xml if $gettext_type eq "xml";
    &type_glade if $gettext_type eq "glade";
    &type_scheme if $gettext_type eq "scheme";
    &type_schemas  if $gettext_type eq "schemas";
    &type_rfc822deb  if $gettext_type eq "rfc822deb";
}

sub entity_decode_minimal
{
    local ($_) = @_;

    s/&apos;/'/g; # '
    s/&quot;/"/g; # "
    s/&amp;/&/g;

    return $_;
}

sub entity_decode
{
    local ($_) = @_;

    s/&apos;/'/g; # '
    s/&quot;/"/g; # "
    s/&amp;/&/g;
    s/&lt;/</g;
    s/&gt;/>/g;

    return $_;
}

sub escape_char
{
    return '\"' if $_ eq '"';
    return '\n' if $_ eq "\n";
    return '\\' if $_ eq '\\';

    return $_;
}

sub escape
{
    my ($string) = @_;
    return join "", map &escape_char, split //, $string;
}

sub type_ini {
    ### For generic translatable desktop files ###
    while ($input =~ /^_.*=(.*)$/mg) {
        $messages{$1} = [];
    }
}

sub type_keys {
    ### For generic translatable mime/keys files ###
    while ($input =~ /^\s*_\w+=(.*)$/mg) {
        $messages{$1} = [];
    }
}

sub type_xml {
    ### For generic translatable XML files ###
        
    while ($input =~ /(?:<!--([^>]*?)-->[^\n]*\n?[^\n]*)?\s_$w+\s*=\s*\"([^"]*)\"/sg) { # "
        $messages{entity_decode_minimal($2)} = [];
        $comments{entity_decode_minimal($2)} = $1 if (defined($1));
    }

    while ($input =~ /(?:<!--([^>]*?)-->\s*)?<_($w+)(?: xml:space="($w+)")?[^>]*>(.*?)<\/_\2>/sg) {
        $_ = $4;
        if (!defined($3) || $3 ne "preserve") {
            s/\s+/ /g;
            s/^ //;
            s/ $//;
        }
        $messages{$_} = [];
        $comments{$_} = $1 if (defined($1));
    }
}

sub type_schemas {
    ### For schemas XML files ###
         
    # FIXME: We should handle escaped < (less than)
    while ($input =~ /
                      <locale\ name="C">\s*
                          (<default>\s*(?:<!--([^>]*?)-->\s*)?(.*?)\s*<\/default>\s*)?
                          (<short>\s*(?:<!--([^>]*?)-->\s*)?(.*?)\s*<\/short>\s*)?
                          (<long>\s*(?:<!--([^>]*?)-->\s*)?(.*?)\s*<\/long>\s*)?
                      <\/locale>
                     /sgx) {
        my @totranslate = ($3,$6,$9);
        my @eachcomment = ($2,$5,$8);
        foreach (@totranslate) {
            my $currentcomment = shift @eachcomment;
            next if !$_;
            s/\s+/ /g;
            $messages{entity_decode_minimal($_)} = [];
            $comments{entity_decode_minimal($_)} = $currentcomment if (defined($currentcomment));
        }
    }
}

sub type_rfc822deb {
    ### For rfc822-style Debian configuration files ###

    my $lineno = 1;
    my $type = '';
    while ($input =~ /\G(.*?)(^|\n)(_+)([^:]+):[ \t]*(.*?)(?=\n\S|$)/sg)
    {
        my ($pre, $newline, $underscore, $tag, $text) = ($1, $2, $3, $4, $5);
        while ($pre =~ m/\n/g)
        {
            $lineno ++;
        }
        $lineno += length($newline);
        my @str_list = rfc822deb_split(length($underscore), $text);
        for my $str (@str_list)
        {
            $strcount++;
            $messages{$str} = [];
            $loc{$str} = $lineno;
            $count{$str} = $strcount;
            my $usercomment = '';
            while($pre =~ s/(^|\n)#([^\n]*)$//s)
            {
                $usercomment = "\n" . $2 . $usercomment;
            }
            $comments{$str} = $tag . $usercomment;
        }
        $lineno += ($text =~ s/\n//g);
    }
}

sub rfc822deb_split {
    # Debian defines a special way to deal with rfc822-style files:
    # when a value contain newlines, it consists of
    #   1.  a short form (first line)
    #   2.  a long description, all lines begin with a space,
    #       and paragraphs are separated by a single dot on a line
    # This routine returns an array of all paragraphs, and reformat
    # them.
    # When first argument is 2, the string is a comma separated list of
    # values.
    my $type = shift;
    my $text = shift;
    $text =~ s/^[ \t]//mg;
    return (split(/, */, $text, 0)) if $type ne 1;
    return ($text) if $text !~ /\n/;

    $text =~ s/([^\n]*)\n//;
    my @list = ($1);
    my $str = '';
    for my $line (split (/\n/, $text))
    {
        chomp $line;
        if ($line =~ /^\.\s*$/)
        {
            #  New paragraph
            $str =~ s/\s*$//;
            push(@list, $str);
            $str = '';
        }
        elsif ($line =~ /^\s/)
        {
            #  Line which must not be reformatted
            $str .= "\n" if length ($str) && $str !~ /\n$/;
            $line =~ s/\s+$//;
            $str .= $line."\n";
        }
        else
        {
            #  Continuation line, remove newline
            $str .= " " if length ($str) && $str !~ /\n$/;
            $str .= $line;
        }
    }
    $str =~ s/\s*$//;
    push(@list, $str) if length ($str);
    return @list;
}

sub type_glade {
    ### For translatable Glade XML files ###

    my $tags = "label|title|text|format|copyright|comments|preview_text|tooltip|message";

    while ($input =~ /<($tags)>([^<]+)<\/($tags)>/sg) {
	# Glade sometimes uses tags that normally mark translatable things for
        # little bits of non-translatable content. We work around this by not
        # translating strings that only includes something like label4 or window1.
	$messages{entity_decode($2)} = [] unless $2 =~ /^(window|label|dialog)[0-9]+$/;
    }
    
    while ($input =~ /<items>(..[^<]*)<\/items>/sg) {
	for my $item (split (/\n/, $1)) {
	    $messages{entity_decode($item)} = [];
	}
    }

    ## handle new glade files
    while ($input =~ /<(property|atkproperty)\s+[^>]*translatable\s*=\s*"yes"(?:\s+[^>]*comments\s*=\s*"([^"]*)")?[^>]*>([^<]+)<\/\1>/sg) {
	$messages{entity_decode($3)} = [] unless $3 =~ /^(window|label)[0-9]+$/;
        if (defined($2) and !($3 =~ /^(window|label)[0-9]+$/)) {
	   $comments{entity_decode($3)} = entity_decode($2) ;
        }
    }
    while ($input =~ /<atkaction\s+action_name="([^>]*)"\s+description="([^>]+)"\/>/sg) {
        $messages{entity_decode_minimal($2)} = [];
    }
}

sub type_scheme {
    my ($line, $i, $state, $str, $trcomment, $char);
    for $line (split(/\n/, $input)) {
        $i = 0;
        $state = 0; # 0 - nothing, 1 - string, 2 - translatable string
        while ($i < length($line)) {
            if (substr($line,$i,1) eq "\"") {
                if ($state == 2) {
                    $comments{$str} = $trcomment if ($trcomment);
                    $messages{$str} = [];
                    $str = '';
                    $state = 0; $trcomment = "";
                } elsif ($state == 1) {
                    $str = '';
                    $state = 0; $trcomment = "";
                } else {
                    $state = 1;
                    $str = '';
                    if ($i>0 && substr($line,$i-1,1) eq '_') {
                        $state = 2;
                    }
                }
            } elsif (!$state) {
                if (substr($line,$i,1) eq ";") {
                    $trcomment = substr($line,$i+1);
                    $trcomment =~ s/^;*\s*//;
                    $i = length($line);
                } elsif ($trcomment && substr($line,$i,1) !~ /\s|\(|\)|_/) {
                    $trcomment = "";
                }
            } else {
                if (substr($line,$i,1) eq "\\") {
                    $char = substr($line,$i+1,1);
                    if ($char ne "\"" && $char ne "\\") {
                       $str = $str . "\\";
                    }
                    $i++;
                }
                $str = $str . substr($line,$i,1);
            }
            $i++;
        }
    }
}

sub msg_write {
    my @msgids;
    if (%count)
    {
        @msgids = sort { $count{$a} <=> $count{$b} } keys %count;
    }
    else
    {
        @msgids = sort keys %messages;
    }
    for my $message (@msgids)
    {
	my $offsetlines = 1;
	$offsetlines++ if $message =~ /%/;
	if (defined ($comments{$message}))
	{
		while ($comments{$message} =~ m/\n/g)
		{
		    $offsetlines++;
		}
	}
	print OUT "# ".($loc{$message} - $offsetlines).  " \"$FILE\"\n"
	        if defined $loc{$message};
   	print OUT "/* ".$comments{$message}." */\n"
                if defined $comments{$message};
   	print OUT "/* xgettext:no-c-format */\n" if $message =~ /%/;
        
    	my @lines = split (/\n/, $message, -1);
    	for (my $n = 0; $n < @lines; $n++)
	{
            if ($n == 0)
            {
 		print OUT "char *s = N_(\""; 
            }
            else
            {  
                print OUT "             \""; 
            }

            print OUT escape($lines[$n]);

            if ($n < @lines - 1)
            {
                print OUT "\\n\"\n"; 
            }
            else
            {
                print OUT "\");\n";  
	    }
        }
    }
}