use strict;
use POSIX qw(nice);
use constant MONTH => 60*60*24*30;
nice(15);
my $configuration = "$ENV{HOME}/.corpus";
my %opt;
my $revision = "unknown";
my %revision;
my @files;
my $skip = '';
&configure;
&version;
&init;
&update;
&locate;
&rename;
¤t;
&clean_up;
sub version {
my $line;
if (open(TESTING, "$opt{tree}/rules/70_testing.cf")) {
chomp($line = <TESTING>);
if ($line =~ m/^ $revision = $1;
}
close(TESTING);
}
}
sub configure {
open(C, $configuration) || die "open failed: $configuration: $!\n";
while(<C>) {
chomp;
s/ if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
$opt{$1} = $2;
}
}
close(C);
}
sub clean_up {
system "rm -f $opt{tmp}/*.$$";
}
sub init {
$SIG{INT} = \&clean_up;
$SIG{TERM} = \&clean_up;
$ENV{RSYNC_PASSWORD} = $opt{password};
$ENV{TIME} = '%e,%U,%S';
$ENV{TZ} = 'UTC';
}
sub update {
chdir $opt{corpus};
system "rsync -CPcvuzt --timeout=300 $opt{username}" . '@rsync.spamassassin.org::corpus/*.log .';
if (-f "rsync.last") {
open(FIND, "find . -type f -newer rsync.last |");
my $files = "";
while(<FIND>) {
$files .= $_;
}
close(FIND);
if (! $files) {
print STDERR "no new corpus files\n";
if (rand(24) > 1) {
exit 0;
}
else {
print STDERR "updating anyway\n";
}
}
}
open(RSYNC, "> rsync.last");
close(RSYNC);
system "chmod +r *.log";
}
sub locate {
chdir "$opt{tree}/masses";
opendir(CORPUS, $opt{corpus});
@files = sort readdir(CORPUS);
closedir(CORPUS);
@files = grep { /^(?:spam|ham)-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files;
@files = grep {
my $time = 0;
my $tag = 0;
$revision{$_} = "unknown";
open(FILE, "$opt{corpus}/$_");
while (my $line = <FILE>) {
last if $line !~ /^ $time++ if $line =~ /\b(?!08)\d\d:\d\d:\d\d\b/;
$revision{$_} = $1 if $line =~ m/(?:CVS|SVN) revision:\s*(\S+)/;
}
close(FILE);
if (!$time) {
$skip .= "# skipped $_: time is between 0800 UTC and 0900 UTC\n";
}
$time;
} @files;
}
sub rename {
use File::Copy qw(move);
my $hour = (gmtime(time))[2];
if ($hour == 9) {
chdir $opt{html};
opendir(HTML, $opt{html});
my @html = readdir(HTML);
closedir(HTML);
@html = grep { -f } @html;
for (@html) {
move($_, "last/$_");
}
}
}
sub sort_all {
my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));
if ($a1 =~ /^OVERALL/) { $n -= 1000; }
elsif ($a1 =~ /^\(all messages\)/) { $n -= 100; }
elsif ($a1 =~ /^\(all messages as \%\)/) { $n -= 10; }
if ($b1 =~ /^OVERALL/) { $n += 1000; }
elsif ($b1 =~ /^\(all messages\)/) { $n += 100; }
elsif ($b1 =~ /^\(all messages as \%\)/) { $n += 10; }
return $n;
}
sub time_filter {
my ($after, $before) = @_;
if (/time=(\d+)/) {
return ((time - $1 >= MONTH * $after) &&
(time - $1 < MONTH * $before));
}
return 0;
}
sub current {
for my $class ("DETAILS", "HTML", "NET") {
for my $age ("new", "all", "age") {
print STDERR "generating $class.$age\n";
next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);
my @ham = grep { /^ham/ } @files;
my @spam = grep { /^spam/ } @files;
print STDERR "ham: " . join(' ', @ham) . "\n";
print STDERR "spam: " . join(' ', @spam) . "\n";
chdir $opt{corpus};
if ($class eq "NET") {
@ham = grep { /-net-/ } @ham;
@spam = grep { /-net-/ } @spam;
print STDERR "ham: " . join(' ', @ham) . "\n";
print STDERR "spam: " . join(' ', @spam) . "\n";
}
else {
my %spam;
my %ham;
for my $file (@spam) {
$spam{$1}++ if ($file =~ m/-(\w+)\.log$/);
}
for my $file (@ham) {
$ham{$1}++ if ($file =~ m/-(\w+)\.log$/);
}
while (my ($user, $count) = each %ham) {
if ($count > 1) {
my $nightly = "ham-$user.log";
my $weekly = "ham-net-$user.log";
if ($revision{$nightly} >= $revision{$weekly}) {
@ham = grep { $_ ne $weekly } @ham;
}
else {
@ham = grep { $_ ne $nightly } @ham;
}
}
}
while (my ($user, $count) = each %spam) {
if ($count > 1) {
my $nightly = "spam-$user.log";
my $weekly = "spam-net-$user.log";
if ($revision{$nightly} >= $revision{$weekly}) {
@spam = grep { $_ ne $weekly } @spam;
}
else {
@spam = grep { $_ ne $nightly } @spam;
}
}
}
print STDERR "ham: " . join(' ', @ham) . "\n";
print STDERR "spam: " . join(' ', @spam) . "\n";
}
if ($class eq "NET" && $age ne "7day") {
@ham = grep { -M "$_" < 10 } @ham;
@spam = grep { -M "$_" < 10 } @spam;
my $wanted = 0.0;
for (@spam, @ham) {
$wanted = $revision{$_} if ($revision{$_} > $wanted);
}
@spam = grep { $revision{$_} eq $wanted } @spam;
@ham = grep { $revision{$_} eq $wanted } @ham;
print STDERR "ham: " . join(' ', @ham) . "\n";
print STDERR "spam: " . join(' ', @spam) . "\n";
}
elsif ($age =~ /^(?:new|all|age)$/) {
@ham = grep { -M "$_" < -M $opt{tagtime} } @ham;
@spam = grep { -M "$_" < -M $opt{tagtime} } @spam;
@ham = grep { $revision{$_} eq $revision } @ham;
@spam = grep { $revision{$_} eq $revision } @spam;
print STDERR "ham: " . join(' ', @ham) . "\n";
print STDERR "spam: " . join(' ', @spam) . "\n";
}
elsif ($age =~ /(\d+)day/) {
my $mtime = $1;
@ham = grep { -M "$_" < $mtime } @ham;
@spam = grep { -M "$_" < $mtime } @spam;
print STDERR "ham: " . join(' ', @ham) . "\n";
print STDERR "spam: " . join(' ', @spam) . "\n";
}
open(OUT, "> $opt{html}/$class.$age");
print OUT "# ham results used: " . join(" ", @ham) . "\n";
print OUT "# spam results used: " . join(" ", @spam) . "\n";
for (@ham) {
print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
}
for (@spam) {
print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
}
my $flags = "";
$flags = "-t net -s 1" if $class eq "NET";
$flags = "-M HTML_MESSAGE" if $class eq "HTML";
if ($age eq "all") {
my %spam;
my %ham;
my @output;
for my $file (@spam) {
$spam{$1} = $file if ($file =~ m/-(\w+)\.log$/);
}
for my $file (@ham) {
$ham{$1} = $file if ($file =~ m/-(\w+)\.log$/);
}
unlink "$opt{tmp}/ham.log.$$";
unlink "$opt{tmp}/spam.log.$$";
next unless (scalar keys %spam && scalar keys %ham);
for my $user (sort keys %spam) {
next unless defined $ham{$user};
chdir "$opt{tree}/masses";
system("cat $opt{corpus}/$ham{$user} >> $opt{tmp}/ham.log.$$");
system("cat $opt{corpus}/$spam{$user} >> $opt{tmp}/spam.log.$$");
open(IN, "./hit-frequencies -xpa $flags $opt{corpus}/$spam{$user} $opt{corpus}/$ham{$user} |");
while(<IN>) {
chomp;
push @output, "$_:$user\n";
}
close(IN);
}
open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
while(<IN>) {
push @output, $_;
}
close(IN);
for (sort sort_all @output) {
print OUT $_;
}
}
elsif ($age eq "age") {
my @output;
for my $which (("0-1", "1-3", "3-6")) {
my ($after, $before) = split(/-/, $which);
chdir $opt{corpus};
for my $type (("ham", "spam")) {
open(TMP, "> $opt{tmp}/$type.log.$$");
my @array = ($type eq "ham") ? @ham : @spam;
for my $file (@array) {
open(IN, $file);
while (<IN>) {
print TMP $_ if time_filter($after, $before);
}
close(IN);
}
close (TMP);
}
chdir "$opt{tree}/masses";
open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
while(<IN>) {
chomp;
push @output, "$_:$which\n";
}
close(IN);
}
for (sort sort_all @output) {
print OUT $_;
}
}
elsif (@ham && @spam) {
system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
chdir "$opt{tree}/masses";
open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
while(<IN>) {
print(OUT);
}
close(IN);
}
close(OUT);
}
}
}