xmlfilter   [plain text]


#!/usr/local/bin/perl -w
#
# $Revision: 1.1.1.1 $
#
# $Date: 2003/07/27 11:07:11 $

use XML::Parser;

my $Usage =<<'End_of_Usage;';
Usage is:
    xmlfilter [-h] [-nl] [{-+}root] [{-+}el=elname] [{-+}el:elnamepat]
              [{-+}att:attname] [{-+}att:attname:attvalpat] xmlfile

Prints on standard output the result of filtering the given xmlfile
for elements according to the switches. A '-' option will drop the
element from the output; a '+' will keep it. The output should also
be a well-formed XML document.

    -h		Print this message

    -nl         Emit a newline prior to every start tag.

    [-+]root	Drop (or keep) the root element. Defaults to keep.
		If the root element were named "foo", then -root
		would be equivalent to -el=foo. Note that even if
		you're dropping the root element, it's start and
		end tag are kept in order that the output remains
		a well-formed XML document.

    [-+]el=elname
		Drop (or keep) elements of type elname.

    [-+]el:elnamepat
		Drop (or keep) element whose type name matches elnamepat.

    [-+]att:attname
		Drop (or keep) elements which have an attribute = attname.

    [-+]att:attname:attvalpat
		Drop (or keep) elements which have an attribute = attname
		and for which the attribute value matches attvalpat.
End_of_Usage;

my $pass = 1;
my $do_newline = 0;

my $attcheck = 0;

my %drop_el;
my @drop_elpat;

my %keep_el;
my @keep_elpat;

my %drop_att;
my %keep_att;

my $always_true = sub {1;};
my $root_element = '';

my $in_cdata = 0;

# Process options

while (defined($ARGV[0]) and $ARGV[0] =~ /^[-+]/)
{
    my $opt = shift;

    if ($opt eq '-root')
    {
	$pass = 0;
    }
    elsif ($opt eq '+root')
    {
	$pass = 1;
    }
    elsif ($opt eq '-h')
    {
	print $Usage;
	exit;
    }
    elsif ($opt eq '-nl')
    {
	$do_newline = 1;
    }
    elsif ($opt =~ /^([-+])el([:=])(\S*)/)
    {
	my ($disp, $kind, $pattern) = ($1, $2, $3);
	my ($hashref, $aref);

	if ($disp eq '-')
	{
	    $hashref = \%drop_el;
	    $aref    = \@drop_elpat;
	}
	else
	{
	    $hashref = \%keep_el;
	    $aref    = \@keep_elpat;
	}

	if ($kind eq '=')
	{
	    $hashref->{$pattern} = 1;
	}
	else
	{
	    push(@$aref, $pattern);
	}
    }
    elsif ($opt =~ /^([-+])att:(\w+)(?::(\S*))?/)
    {
	my ($disp, $id, $pattern) = ($1, $2, $3);
	my $ref = ($disp eq '-') ? \%drop_att : \%keep_att;

	if (defined($pattern))
	{
	    $pattern =~ s!/!\\/!g;
	    my $sub;
	    eval "\$sub = sub {\$_[0] =~ /$pattern/;};";

	    $ref->{$id} = $sub;
	}
	else
	{
	    $ref->{$id} = $always_true;
	}

	$attcheck = 1;
    }
    else
    {
	die "Unknown option: $opt\n$Usage";
    }
}

my $drop_el_pattern = join('|', @drop_elpat);
my $keep_el_pattern = join('|', @keep_elpat);

my $drop_sub;
if ($drop_el_pattern)
{
    eval "\$drop_sub = sub {\$_[0] =~ /$drop_el_pattern/;}";
}
else
{
    $drop_sub = sub {};
}

my $keep_sub;
if ($keep_el_pattern)
{
    eval "\$keep_sub = sub {\$_[0] =~ /$keep_el_pattern/;}";
}
else
{
    $keep_sub = sub {};
}

my $doc = shift;

die "No file specified\n$Usage" unless defined($doc);

my @togglestack = ();

my $p = new XML::Parser(ErrorContext => 2,
			Handlers     => {Start => \&start_handler,
					 End   => \&end_handler
					 }
			);

if ($pass) {
  $p->setHandlers(Char       => \&char_handler,
		  CdataStart => \&cdata_start,
		  CdataEnd   => \&cdata_end);
}

$p->parsefile($doc);

print "</$root_element>\n"
    unless $pass;

################
## End of main
################

sub start_handler
{
    my $xp = shift;
    my $el = shift;

    unless ($root_element)
    {
	$root_element = $el;
	print "<$el>\n"
	    unless $pass;
    }

    my ($elref, $attref, $sub);

    if ($pass)
    {
	$elref = \%drop_el;
	$attref = \%drop_att;
	$sub = $drop_sub;
    }
    else
    {
	$elref = \%keep_el;
	$attref = \%keep_att;
	$sub = $keep_sub;
    }

    if (defined($elref->{$el})
	or &$sub($el)
	or check_atts($attref, @_))
    {
	$pass = ! $pass;
	if ($pass) {
	  $xp->setHandlers(Char       => \&char_handler,
			   CdataStart => \&cdata_start,
			   CdataEnd   => \&cdata_end);
	}
	else {
	  $xp->setHandlers(Char       => 0,
			   CdataStart => 0,
			   CdataEnd   => 0);
	}
	push(@togglestack, $xp->depth);
    }

    if ($pass)
    {
	print "\n" if $do_newline;
	print "<$el";
	while (@_)
	{
	    my $id = shift;
	    my $val = shift;

	    $val = $xp->xml_escape($val, "'");
	    print " $id='$val'";
	}
	print ">";
    }
}  # End start_handler

sub end_handler
{
    my $xp = shift;
    my $el = shift;

    if ($pass)
    {
	print "</$el>";
    }

    if (@togglestack and $togglestack[-1] == $xp->depth)
    {
	$pass = ! $pass;
	if ($pass) {
	  $xp->setHandlers(Char       => \&char_handler,
			   CdataStart => \&cdata_start,
			   CdataEnd   => \&cdata_end);
	}
	else {
	  $xp->setHandlers(Char       => 0,
			   CdataStart => 0,
			   CdataEnd   => 0);
	}

	pop(@togglestack);
    }

}  # End end_handler


sub char_handler
{
    my ($xp, $text) = @_;

    if (length($text)) {

      $text = $xp->xml_escape($text, '>')
	unless $in_cdata;

      print $text;
    }
}  # End char_handler

sub cdata_start {
  my $xp = shift;

  print '<![CDATA[';
  $in_cdata = 1;
}

sub cdata_end {
  my $xp = shift;

  print ']]>';
  $in_cdata = 0;
}

sub check_atts
{
    return $attcheck unless $attcheck;

    my $ref = shift;

    while (@_)
    {
	my $id = shift;
	my $val = shift;

	if (defined($ref->{$id}))
	{
	    my $ret = &{$ref->{$id}}($val);
	    return $ret if $ret;
	}
    }

    return 0;
}  # End check_atts

# Tell Emacs that this is really a perl script
# Local Variables:
# mode:perl
# End: