mass-check-results-to-mbox   [plain text]


#!/usr/bin/perl
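#
# Reads mass-check result lines on stdin and writes the messages they refer
# to as a single mbox on stdout.  Very handy for e.g.:
#
#   grep SUBJECT_FREQ spam.log | ./mass-check-results-to-mbox | grep Subject:
#
# Pass -noannotate to omit the X-Mass-Check-Id and X-Mass-Check-Warning lines
# that are otherwise added to the output.
#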
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

# Command-line switches.  Every argument is consumed here, so nothing is left
# in @ARGV by the time the main loop reads its input.
#   -grep VALUE    store VALUE in $grep
#   -noannotate    turn off the X-Mass-Check-* annotations
my $grep;
my $annotate = 1;
while (scalar @ARGV) {
  my $opt = shift @ARGV;
  if ($opt eq '-grep') {
    $grep = shift @ARGV;
  }
  elsif ($opt eq '-noannotate') {
    $annotate = 0;
  }
}

# Main loop: every input line is a mass-check result line.  After an optional
# "filename:" prefix is stripped, the line must look like a one-character
# flag, an integer, and then the message identifier; anything else is skipped.
#
# An identifier of the form "folder:<message-id>" is looked up inside that
# mailbox; any other identifier is treated as the path of a file holding one
# message (optionally .gz or .bz2 compressed).
while (<>) {
  s/^[^\s:]+://;  # filenames, from "grep foo *"

  next if /^#/;
  /^.\s+-?\d+\s+(\S+) / or next;
  my $mail = $1;

  if ($mail =~ /^\S+:</) {
    my $msgp = find_in_mailbox ($mail);
    if (defined $msgp) {
      $annotate and unshift (@$msgp, "X-Mass-Check-Id: $mail\n");
      handle ($msgp);
    } else {
      mywarn ("failed to find message for $mail\n");
    }

  } else {
    # List-form pipe open and three-argument open keep $mail away from the
    # shell and from open's mode parsing, so paths containing shell
    # metacharacters taken from the log cannot run commands.
    my ($in, $opened);
    if ($mail =~ /\.gz$/) {
      $opened = open ($in, '-|', 'gunzip', '-cd', $mail);
      $opened or mywarn ("gunzip $mail failed: $!");
    } elsif ($mail =~ /\.bz2$/) {
      $opened = open ($in, '-|', 'bzip2', '-cd', $mail);
      $opened or mywarn ("bunzip2 $mail failed: $!");
    } else {
      $opened = open ($in, '<', $mail);
      $opened or mywarn ("open $mail failed: $!");
    }

    # Nothing to read: the warning has been issued, so do not emit an empty
    # stub message for this entry.
    $opened or next;

    my @msg = <$in>;
    close $in;

    # Drop any leading mbox separator and stale annotation lines so that the
    # output carries exactly one of each.
    while (scalar @msg > 0 &&
        $msg[0] =~ /^(?:From|X-Mass-Check-Id:) /)
    {
      shift @msg;
    }
    $annotate and unshift (@msg, "X-Mass-Check-Id: $mail\n");

    handle (\@msg);
  }
}

###########################################################################

# Fetch one message out of an mbox folder.
#
# Arguments:
#   $mail - identifier of the form "folder:<message-id>"; the part before
#           ":<" is the mailbox path (optionally .gz or .bz2 compressed).
#
# Returns a reference to the array of lines of the message, or undef when the
# identifier is malformed, the folder cannot be opened, or no message matches.
#
# The MBOX handle and $CURRENT_MBOX_OPEN are kept between calls so that
# consecutive lookups in the same folder can continue from the current read
# position before falling back to a rescan from the top.
sub find_in_mailbox {
  my ($mail) = @_;
  $mail =~ /^(\S+):</ or return;
  my $folder = $1;

  if (defined $CURRENT_MBOX_OPEN && $folder eq $CURRENT_MBOX_OPEN) {
    # try from current position first
    my $msgp = mbox_search($mail, $folder);
    if (defined $msgp) { return $msgp; }
  }

  # failed. have to (re-|)open.  List-form pipe open and three-argument open
  # keep the folder name away from the shell.
  my $opened;
  if ($folder =~ /\.gz$/) {
    $opened = open (MBOX, '-|', 'gunzip', '-cd', $folder);
    $opened or mywarn ("gunzip $folder failed: $!");
  } elsif ($folder =~ /\.bz2$/) {
    $opened = open (MBOX, '-|', 'bzip2', '-cd', $folder);
    $opened or mywarn ("bunzip2 $folder failed: $!");
  } else {
    $opened = open (MBOX, '<', $folder);
    $opened or mywarn ("open $folder failed: $!");
  }

  if (!$opened) {
    # open() has already closed any previous MBOX, so no folder is current.
    $CURRENT_MBOX_OPEN = undef;
    return;
  }

  $CURRENT_MBOX_OPEN = $folder;

  # Skip ahead to just after the first "From " separator; a lexical line
  # variable is used so the caller's $_ is left alone.
  while (defined (my $line = <MBOX>)) {
    $line =~ /^From \S+ +... ... / and last;
  }

  my $msgp = mbox_search($mail, $folder);
  return $msgp;
}

# Scan forward through the already-open MBOX handle for one message.
#
# Arguments:
#   $mail   - wanted identifier, in "folder:<message-id>" form
#   $folder - mailbox name; it is prefixed to each Message-Id read so the
#             result can be compared with $mail
#
# Returns a reference to the array of lines of the matching message.  If the
# end of the mailbox is reached without a match, MBOX is closed,
# $CURRENT_MBOX_OPEN is reset, and undef is returned.
#
# The handle is expected to be positioned just after a "From " separator
# line.  Messages without a Message-Id header get a synthetic one built from
# their position counted from where this scan started and the host name; that
# header line is also inserted into the returned message.
sub mbox_search {
  my ($mail, $folder) = @_;

  my $count = 0;

  # Resolve the host name once; otherwise `hostname` may be forked on every
  # lookup when neither environment variable is exported.
  if (!defined $MBOX_SEARCH_HOST) {
    $MBOX_SEARCH_HOST = $ENV{'HOSTNAME'} || $ENV{'HOST'} || `hostname` || 'localhost';
  }
  my $host = $MBOX_SEARCH_HOST;

  while (!eof MBOX) {
    my @msg = ();
    my $msgid;
    my $in_header = 1;
    $count++;

    # A lexical line variable keeps the caller's $_ intact.
    while (defined (my $line = <MBOX>)) {
      if ($in_header && $line =~ /^$/) {
        $in_header = 0;

        if (!defined ($msgid)) {
          $msgid = sprintf('<no-msgid-in-msg-%06d@%s.masses.spamassasin.org>', $count, $host);
          push (@msg, "Message-Id: $msgid\n");
        }
      }
      if ($in_header && $line =~ /^Message-Id: (.*)\s*$/i) {
        $msgid = $1;
      }

      # the next separator ends the current message
      last if $line =~ /^From \S+ +... ... /;
      push (@msg, $line);
    }

    # A message cut off before its blank line may have no id at all.
    my $thisid = defined $msgid ? $msgid : '';
    $thisid = "$folder:$thisid";	# so we can find it again
    $thisid =~ s/\s/_/gs;		# make safe

    next if $mail ne $thisid;
    return \@msg;
  }

  # Ran off the end: forget the folder so the caller reopens it.
  close MBOX;
  $CURRENT_MBOX_OPEN = undef;
  return;
}

###########################################################################

# Write one message to STDOUT in mbox form: a fixed "From " separator line,
# the message lines from the given array reference, then a blank line.
sub handle {
  my ($lines_ref) = @_;
  my $separator = "From nobody\@nowhere  Wed Aug 21 12:41:07 2002\n";
  print STDOUT ($separator, @{$lines_ref}, "\n");
}

# Report a problem via warn() and, unless annotation has been switched off,
# also print it to the output as an X-Mass-Check-Warning line.
sub mywarn {
  my @parts = @_;
  warn @parts;
  $annotate and print "X-Mass-Check-Warning: " . join ('', @parts) . "\n";
}