corpus-hourly   [plain text]


#!/usr/bin/perl -w

# settings are located in $HOME/.corpus

use strict;
use POSIX qw(nice);
use constant MONTH => 60*60*24*30;

nice(15);

my $configuration = "$ENV{HOME}/.corpus";
my %opt;
my $revision = "unknown";
my %revision;
my @files;
my $skip = '';

&configure;
&version;
&init;
&update;
&locate;
&rename;
&current;
&clean_up;

sub version {
    my $line;
    if (open(TESTING, "$opt{tree}/rules/70_testing.cf")) {
	chomp($line = <TESTING>);
	if ($line =~ m/^#.*Rev(?:ision)?:\s*(\S+).*/) {
	    $revision = $1;
	}
	close(TESTING);
    }
}

sub configure {
    # does rough equivalent of source
    open(C, $configuration) || die "open failed: $configuration: $!\n";
    while(<C>) {
	chomp;
	s/#.*//;
	if (/^\s*(.*?)\s*=\s*(.*?)\s*$/) {
	    $opt{$1} = $2;
	}
    }
    close(C);
}

sub clean_up {
    system "rm -f $opt{tmp}/*.$$";
}

sub init {
    $SIG{INT} = \&clean_up;
    $SIG{TERM} = \&clean_up;

    $ENV{RSYNC_PASSWORD} = $opt{password};
    $ENV{TIME} = '%e,%U,%S';
    $ENV{TZ} = 'UTC';
}

sub update {
    chdir $opt{corpus};
    system "rsync -CPcvuzt --timeout=300 $opt{username}" . '@rsync.spamassassin.org::corpus/*.log .';
    if (-f "rsync.last") {
	open(FIND, "find . -type f -newer rsync.last |");
	my $files = "";
	while(<FIND>) {
	    $files .= $_;
	}
	close(FIND);
	if (! $files) {
	    print STDERR "no new corpus files\n";
	    if (rand(24) > 1) {
		exit 0;
	    }
	    else {
		print STDERR "updating anyway\n";
	    }
	}
    }
    open(RSYNC, "> rsync.last");
    close(RSYNC);
    system "chmod +r *.log";
}

sub locate {
    chdir "$opt{tree}/masses";
    opendir(CORPUS, $opt{corpus});
    @files = sort readdir(CORPUS);
    closedir(CORPUS);

    @files = grep { /^(?:spam|ham)-(?:net-)?\w+\.log$/ && -f "$opt{corpus}/$_" && -M _ < 10 } @files;
    @files = grep {
	my $time = 0;
	my $tag = 0;
	$revision{$_} = "unknown";
	open(FILE, "$opt{corpus}/$_");
	while (my $line = <FILE>) {
	    last if $line !~ /^#/;
	    $time++ if $line =~ /\b(?!08)\d\d:\d\d:\d\d\b/;
	    $revision{$_} = $1 if $line =~ m/(?:CVS|SVN) revision:\s*(\S+)/;
	}
	close(FILE);
	if (!$time) {
	    $skip .= "# skipped $_: time is between 0800 UTC and 0900 UTC\n";
	}
	$time;
    } @files;
}

sub rename {
    use File::Copy qw(move);

    my $hour = (gmtime(time))[2];
    if ($hour == 9) {
	chdir $opt{html};
	opendir(HTML, $opt{html});
	my @html = readdir(HTML);
	closedir(HTML);
	@html = grep { -f } @html;
	for (@html) {
	    move($_, "last/$_");
	}
    }
}

sub sort_all {
    my ($a1, $a2) = ($a =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);
    my ($b1, $b2) = ($b =~ m/(\(.*?\)|\S+)(?::(\S+))?$/);

    my $n = ($a1 cmp $b1) || (($a2 || '') cmp ($b2 || ''));
    if ($a1 =~ /^OVERALL/)			{ $n -= 1000; }
    elsif ($a1 =~ /^\(all messages\)/)		{ $n -= 100; }
    elsif ($a1 =~ /^\(all messages as \%\)/)	{ $n -= 10; }
    if ($b1 =~ /^OVERALL/)			{ $n += 1000; }
    elsif ($b1 =~ /^\(all messages\)/)		{ $n += 100; }
    elsif ($b1 =~ /^\(all messages as \%\)/)	{ $n += 10; }
    return $n;
}

sub time_filter {
    my ($after, $before) = @_;
    if (/time=(\d+)/) {
	return ((time - $1 >= MONTH * $after) &&
		(time - $1 < MONTH * $before));
    }
    return 0;
}

sub current {
    for my $class ("DETAILS", "HTML", "NET") {
#	for my $age ("new", "all", "age", "1day", "2day", "7day") {
	for my $age ("new", "all", "age") {
	    print STDERR "generating $class.$age\n";

	    next if ($class eq "NET" && $age !~ /^(?:new|all|age|7day)$/);

	    my @ham = grep { /^ham/ } @files;
	    my @spam = grep { /^spam/ } @files;

	    print STDERR "ham: " . join(' ', @ham) . "\n";
	    print STDERR "spam: " . join(' ', @spam) . "\n";

	    chdir $opt{corpus};

	    # net vs. local
	    if ($class eq "NET") {
		@ham = grep { /-net-/ } @ham;
		@spam = grep { /-net-/ } @spam;
		print STDERR "ham: " . join(' ', @ham) . "\n";
		print STDERR "spam: " . join(' ', @spam) . "\n";
	    }
	    else {
		# if both net and local exist, use newer
		my %spam;
		my %ham;
		
		for my $file (@spam) {
		    $spam{$1}++ if ($file =~ m/-(\w+)\.log$/);
		}
		for my $file (@ham) {
		    $ham{$1}++ if ($file =~ m/-(\w+)\.log$/);
		}
		while (my ($user, $count) = each %ham) {
		    if ($count > 1) {
			my $nightly = "ham-$user.log";
			my $weekly = "ham-net-$user.log";
			if ($revision{$nightly} >= $revision{$weekly}) {
			    @ham = grep { $_ ne $weekly } @ham;
			}
			else {
			    @ham = grep { $_ ne $nightly } @ham;
			}
		    }
		}
		while (my ($user, $count) = each %spam) {
		    if ($count > 1) {
			my $nightly = "spam-$user.log";
			my $weekly = "spam-net-$user.log";
			if ($revision{$nightly} >= $revision{$weekly}) {
			    @spam = grep { $_ ne $weekly } @spam;
			}
			else {
			    @spam = grep { $_ ne $nightly } @spam;
			}
		    }
		}
		print STDERR "ham: " . join(' ', @ham) . "\n";
		print STDERR "spam: " . join(' ', @spam) . "\n";
	    }
	    
	    # age
	    if ($class eq "NET" && $age ne "7day") {
		@ham = grep { -M "$_" < 10 } @ham;
		@spam = grep { -M "$_" < 10 } @spam;
		# find most recent CVS revision
		my $wanted = 0.0;
		for (@spam, @ham) {
		    $wanted = $revision{$_} if ($revision{$_} > $wanted);
		}
		@spam = grep { $revision{$_} eq $wanted } @spam;
		@ham = grep { $revision{$_} eq $wanted } @ham;
		print STDERR "ham: " . join(' ', @ham) . "\n";
		print STDERR "spam: " . join(' ', @spam) . "\n";
	    }
	    elsif ($age =~ /^(?:new|all|age)$/) {
		@ham = grep { -M "$_" < -M $opt{tagtime} } @ham;
		@spam = grep { -M "$_" < -M $opt{tagtime} } @spam;
		@ham = grep { $revision{$_} eq $revision } @ham;
		@spam = grep { $revision{$_} eq $revision } @spam;
		print STDERR "ham: " . join(' ', @ham) . "\n";
		print STDERR "spam: " . join(' ', @spam) . "\n";
	    }
	    elsif ($age =~ /(\d+)day/) {
		my $mtime = $1;
		@ham = grep { -M "$_" < $mtime } @ham;
		@spam = grep { -M "$_" < $mtime } @spam;
		print STDERR "ham: " . join(' ', @ham) . "\n";
		print STDERR "spam: " . join(' ', @spam) . "\n";
	    }
	    
	    open(OUT, "> $opt{html}/$class.$age");
	    print OUT "# ham results used: " . join(" ", @ham) . "\n";
	    print OUT "# spam results used: " . join(" ", @spam) . "\n";
	    for (@ham) {
		print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
	    }
	    for (@spam) {
		print OUT "# $_=$revision{$_}\n" if $revision{$_} ne $revision;
	    }

	    my $flags = "";
	    $flags = "-t net -s 1" if $class eq "NET";
	    $flags = "-M HTML_MESSAGE" if $class eq "HTML";

	    if ($age eq "all") {
		my %spam;
		my %ham;
		my @output;
		
		for my $file (@spam) {
		    $spam{$1} = $file if ($file =~ m/-(\w+)\.log$/);
		}
		for my $file (@ham) {
		    $ham{$1} = $file if ($file =~ m/-(\w+)\.log$/);
		}
		unlink "$opt{tmp}/ham.log.$$";
		unlink "$opt{tmp}/spam.log.$$";
		next unless (scalar keys %spam && scalar keys %ham);
		for my $user (sort keys %spam) {
		    next unless defined $ham{$user};
		    chdir "$opt{tree}/masses";
		    system("cat $opt{corpus}/$ham{$user} >> $opt{tmp}/ham.log.$$");
		    system("cat $opt{corpus}/$spam{$user} >> $opt{tmp}/spam.log.$$");
		    open(IN, "./hit-frequencies -xpa $flags $opt{corpus}/$spam{$user} $opt{corpus}/$ham{$user} |");
		    while(<IN>) {
			chomp;
			push @output, "$_:$user\n";
		    }
		    close(IN);
		}
		open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
		while(<IN>) {
		    push @output, $_;
		}
		close(IN);
		for (sort sort_all @output) {
		    print OUT $_;
		}
	    }
	    elsif ($age eq "age") {
		my @output;

		for my $which (("0-1", "1-3", "3-6")) {
		    my ($after, $before) = split(/-/, $which);
		    # get and filter logs
		    chdir $opt{corpus};
		    for my $type (("ham", "spam")) {
			open(TMP, "> $opt{tmp}/$type.log.$$");
			my @array = ($type eq "ham") ? @ham : @spam;
			for my $file (@array) {
			    open(IN, $file);
			    while (<IN>) {
				print TMP $_ if time_filter($after, $before);
			    }
			    close(IN);
			}
			close (TMP);
		    }
		    # print out by age
		    chdir "$opt{tree}/masses";
		    open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
		    while(<IN>) {
			chomp;
			push @output, "$_:$which\n";
		    }
		    close(IN);
		}
		for (sort sort_all @output) {
		    print OUT $_;
		}
	    }
	    elsif (@ham && @spam) {
		# get logs
		system("cat " . join(" ", @ham) . " > $opt{tmp}/ham.log.$$");
		system("cat " . join(" ", @spam) . " > $opt{tmp}/spam.log.$$");
	
		chdir "$opt{tree}/masses";
		open(IN, "./hit-frequencies -xpa $flags $opt{tmp}/spam.log.$$ $opt{tmp}/ham.log.$$ |");
		while(<IN>) {
		    print(OUT);
		}
		close(IN);
	    }
	    close(OUT);
	}
    }
}