parse-rules-for-masses   [plain text]


#!/usr/bin/perl -w
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use strict;

# usage()
#
# Abort the program with the command-line help text.  The text ends in a
# newline, so die() does not append the "at FILE line N" suffix to it.
sub usage {
  my $help_text = <<'END_OF_USAGE';

parse-rules-for-masses: parse the SpamAssassin rules files for mass-checks,
        evolving, and frequency analysis

usage: ./parse-rules-for-masses [-d rulesdir] [-o outputfile] [-s scoreset] [-x]

rulesdir defaults to ../rules
outputfile defaults to ./tmp/rules.pl
scoreset default to 0
-x do not include test rules files (ie 70_*)
END_OF_USAGE

  die $help_text;
}

use Getopt::Long;
use Data::Dumper;

# Command-line settings.  These are package variables because the subs
# further down in this file (readrules) read $scoreset and $skip_test_rules
# directly.
our (@rulesdirs, $outputfile, $scoreset, $skip_test_rules);

GetOptions(
  "d=s"      => \@rulesdirs,
  "o=s"      => \$outputfile,
  "s=i"      => \$scoreset,
  "x"        => \$skip_test_rules,
  "help|h|?" => \&usage,
);

# Fall back to the documented defaults for anything not given.
@rulesdirs = ("../rules") unless @rulesdirs;

unless (defined $outputfile) {
  $outputfile = "./tmp/rules.pl";
  mkdir("tmp", 0755);
}

$scoreset = 0 unless defined $scoreset;

# $rules and $reuse are filled in by readrules(); the '_scoreset' entry is
# bookkeeping, not a rule, and is skipped wherever rules are iterated.
my $rules = { _scoreset => $scoreset };
my $reuse = { };
readrules(@rulesdirs);

# Flat rule-name => score map, dumped alongside the full rule data.
my $scores = { };
for my $name (grep { $_ ne '_scoreset' } keys %{$rules}) {
  $scores->{$name} = $rules->{$name}->{score};
}

writerules($outputfile);
exit;

# readrules(@dirs)
#
# Parse every *.cf file found directly in each of the given directories
# (in sorted filename order per directory) and record the result in the
# file-level $rules and $reuse hashes.
#
# For each rule name, $rules->{NAME} may receive: type, lang, issubrule,
# tflags, eval, depends (meta rules only), code, describe, score, mutable.
# "reuse" lines (including commented-out "#reuse" ones) populate $reuse.
#
# After all files are read, entries that never got a rule definition are
# dropped, missing scores are defaulted, and scores are zeroed for rules
# that do not apply to the selected $scoreset.
#
# A file that cannot be opened is reported with a warning and skipped.
# Returns nothing useful.
sub readrules {
  foreach my $indir (@_) {
    my @files = glob("$indir/*.cf");

    foreach my $file (sort @files) {
      # -x given: leave out test rules files (anything matching 70_)
      next if ($skip_test_rules && $file =~ /70_/);

      my $fh;
      if (!open($fh, '<', $file)) {
        warn "parse-rules-for-masses: cannot read $file: $!\n";
        next;
      }

      my $scores_mutable = 1;      # always start off mutable in each file

      while (my $line = <$fh>) {
        # these appear in comments, so deal with them before comment stripping
        # takes place
        if ($line =~ /<\/gen:mutable>/i) {
          $scores_mutable = 0;
        }
        elsif ($line =~ /<gen:mutable>/i) {
          $scores_mutable = 1;
        }

        # oh, this is a dirty dirty hack, but we don't need this at runtime:
        # treat a commented-out "#reuse" line as a live "reuse" line
        $line =~ s/^#reuse/reuse/;

        # strip comments and surrounding whitespace; skip what is left empty
        $line =~ s/#.*$//g;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next if $line =~ /^$/;

        # TODO: this could be overwriting stuff
        my $lang = '';
        if ($line =~ s/^lang\s+(\S+)\s+//) {
          $lang = $1;
        }

        if ($line =~ /^(header|rawbody|body|full|uri|meta|mimeheader)\s+(\S+)\s+(.*)$/) {
          my ($type, $name, $val) = ($1, $2, $3);

          my $rule = ($rules->{$name} ||= { });
          $rule->{type} = $type;
          $rule->{lang} = $lang;
          $rule->{issubrule} = ($name =~ /^__/) ? '1' : '0';
          $rule->{tflags} = '';
          $rule->{eval} = ($val =~ /\beval:(\w+)/) ? $1 : '0';
          if ($type eq "meta") {
            # every word in the expression that is not a plain number is
            # taken to be the name of a rule this meta depends on
            my @depends = grep { !/^\d+$/ } ($val =~ m/(\w+)/g);
            push(@{ $rule->{depends} }, @depends);
          }
          $rule->{code} = $val;

        } elsif ($line =~ /^describe\s+(\S+)\s+(.+)$/) {
          my ($name, $text) = ($1, $2);
          $rules->{$name} ||= { };
          if ($lang) {
            # a language-specific description never replaces an existing one
            $rules->{$name}->{describe} ||= $text;
          }
          else {
            $rules->{$name}->{describe} = $text;
          }

        } elsif ($line =~ /^tflags\s+(\S+)\s+(.+)$/) {
          my ($name, $flags) = ($1, $2);
          $rules->{$name} ||= { };
          $rules->{$name}->{tflags} = $flags;

        } elsif ($line =~ /^score\s+(\S+)\s+(.+)$/) {
          my ($name, $score) = ($1, $2);
          $rules->{$name} ||= { };
          if ($score =~ /\s/) { # there are multiple scores
            ($score) = (split(/\s+/, $score))[$scoreset];
          }
          $rules->{$name}->{score} = $score;
          $rules->{$name}->{mutable} = $scores_mutable;

        } elsif ($line =~ /^reuse\s+(.*)$/) {
          # first word is the current rule name, the rest are old names;
          # all of them (the new name included) map to the new name
          my ($new, @old) = split(' ', $1);
          push @old, $new;
          for my $old (@old) {
            $reuse->{$old} ||= { };
            $reuse->{$old}->{reuse} = $new;
          }
          $reuse->{$new} ||= { };
          $reuse->{$new}->{skip} = 1;
        }
      }
      close $fh;
    }
  }

  foreach my $rule (keys %{$rules}) {
    next if ($rule eq '_scoreset');
    if (!defined $rules->{$rule}->{type}) {
      delete $rules->{$rule};   # no rule definition -> no rule
      next;
    }

    my $tflags = $rules->{$rule}->{tflags};
    if (!defined $rules->{$rule}->{score}) {
      # default score: 1.0, or 0.01 for T_ rules; negated for "nice" rules
      my $def = 1.0;
      if ($rule =~ /^T_/) {
        $def = 0.01;
      }

      if ($tflags =~ /\bnice\b/) {
        $rules->{$rule}->{score} = -$def;
      } else {
        $rules->{$rule}->{score} = $def;
      }
    }

    # ignore net rules in set 0 or set 2
    if ($tflags =~ /\bnet\b/ && ($scoreset & 1) == 0) {
      $rules->{$rule}->{mutable} = 0;
      $rules->{$rule}->{score} = 0;
    }

    # ignore bayes rules in set 0 or set 2
    if ($tflags =~ /\blearn\b/ && ($scoreset & 2) == 0) {
      $rules->{$rule}->{mutable} = 0;
      $rules->{$rule}->{score} = 0;
    }

    # if a rule didn't have a score specified, assume it's
    # mutable
    if (!defined $rules->{$rule}->{mutable}) {
      $rules->{$rule}->{mutable} = 1;
    }

    # although T_ test rules are clamped to 0.01.  this works well
    # for release mass-checks, at least
    if ($rule =~ /^T_/) {
      $rules->{$rule}->{mutable} = 0;
    } elsif ($rule eq 'AWL') {  # ignore entirely
      $rules->{$rule}->{mutable} = 0;
      $rules->{$rule}->{score} = 0;
    }

  }
}

# writerules($outfile)
#
# Dump the file-level $rules, $scores and $reuse structures to $outfile as
# Perl source (declaring %rules, %scores and %reuse) followed by a trailing
# "1;".  The directory that is to contain $outfile is created first if it
# does not exist yet.
#
# Dies if the file cannot be opened for writing or cannot be closed cleanly.
sub writerules {
  my $outfile = shift;

  # Create the output directory without going through the shell, so that
  # paths containing spaces or shell metacharacters are handled safely.
  # Modules are loaded here so this sub does not depend on file-level imports.
  require File::Basename;
  require File::Path;
  my $outdir = File::Basename::dirname($outfile);
  if (!-d $outdir) {
    # mkpath croaks on failure; ignore that here, because the open below
    # then fails and reports the underlying problem.
    eval { File::Path::mkpath($outdir); };
  }

  open (my $out, '>', $outfile) or die "cannot write to $outfile: $!";
  # timestamp line followed by a blank line
  print $out "# dumped at ".scalar(localtime)."\n\n";

  # Purity makes Data::Dumper emit extra statements so that nested and
  # self-referential structures are rebuilt correctly when the file is
  # evaluated.
  local $Data::Dumper::Purity = 1;
  print $out Data::Dumper->Dump ([$rules, $scores, $reuse], ['*rules', '*scores', '*reuse']);

  print $out "1;";
  # buffered write errors only show up at close time
  close $out or die "error closing $outfile: $!";
}