split_corpora   [plain text]


#!/usr/bin/perl -w

use FindBin;
use lib "$FindBin::Bin/../lib";

use strict;

use Mail::SpamAssassin::ArchiveIterator;
use Getopt::Std;
use FileHandle;

###########

# Print the command-line synopsis to STDERR and terminate the script with
# exit status 1.  Never returns.
sub usage {
  my $synopsis = join('',
    "split-corpora [-n num_buckets] [-p outfile_prefix] ",
    "[-l max_messages] ",
    "folder1 ....\n",
  );

  print STDERR $synopsis;
  exit(1);
} # usage()

###########

# Command-line switches, filled in by Getopt::Std as package variables:
#   -n  number of output buckets (defaults to 2)
#   -p  prefix for the output file names (defaults to "bucket")
#   -l  maximum number of messages (consulted by wanted() and by the
#       final message-count check)
#   -h  print the usage text and exit
our ($opt_n, $opt_p, $opt_h, $opt_l);

# getopts() -- not getopt() -- is the Getopt::Std entry point that
# understands the "x:" notation: n, p and l take a value while h is a plain
# flag.  It returns false for an unrecognised switch or a missing value,
# which is treated as a usage error.
getopts('n:p:l:h') or usage();

usage() if ($opt_h);

my $num_buckets = $opt_n || 2;
my $prefix      = $opt_p || "bucket";
my @IN_FILES    = @ARGV;

# The bucket count is later used as a range bound and as a modulus, so
# anything other than a positive integer is rejected up front.
if ($num_buckets !~ /^[1-9][0-9]*\z/) {
  print STDERR "split-corpora: -n must be a positive integer, not '$num_buckets'\n";
  usage();
}

# The message limit is compared numerically; reject non-numeric values
# instead of letting them silently evaluate as 0.
if (defined $opt_l && $opt_l !~ /^[0-9]+\z/) {
  print STDERR "split-corpora: -l must be a non-negative integer, not '$opt_l'\n";
  usage();
}

usage() if (@IN_FILES == 0);

# Build one target string per input path for the iterator: directories
# become "ham:dir:PATH", everything else is treated as an mbox file and
# becomes "ham:mbox:PATH".
my @targets = map {
  my $is_dir = -d $_;
  $is_dir ? "ham:dir:$_" : "ham:mbox:$_";
} @IN_FILES;

# Open one output file per bucket, named "<prefix>.1" .. "<prefix>.N".
# The handles are kept in @bucket_fhs in bucket order, so index 0 holds
# the handle for "<prefix>.1".
my @bucket_fhs = ();
foreach my $bucket (1 .. $num_buckets) {
  my $bucket_path = "$prefix.$bucket";
  my $bucket_fh   = FileHandle->new();

  # Pass the mode separately from the file name so that characters in a
  # user-supplied prefix (leading '>', '&', whitespace, ...) cannot be
  # interpreted as part of the open mode.
  if (!$bucket_fh->open($bucket_path, '>')) {
    die "Could not open '$bucket_path' for writing: $!\n";
  }

  push(@bucket_fhs, $bucket_fh);
} # foreach my $bucket (1 .. $num_buckets)

# Index into @bucket_fhs of the bucket that receives the next message;
# wanted() advances it round-robin.
my $current_bucket = 0;

# The option hash is handed to the iterator as-is; see the
# Mail::SpamAssassin::ArchiveIterator documentation for what each key
# controls.
my $iter = Mail::SpamAssassin::ArchiveIterator->new({
        'opt_j' => 1,
        'opt_n' => 1,
        'opt_all' => 1,
  });

# wanted() is called for each message; the second callback is a no-op.
$iter->set_functions(\&wanted, sub { });
my $messagecount = 0;

# wanted() dies with 'HITLIMIT' once the -l limit is reached; that is the
# normal way to stop early.  Any other failure is re-thrown.  The explicit
# success flag avoids relying on $@ alone to detect a failed eval.
my $completed = eval {
  $iter->run(@targets);
  1;
};
if (!$completed) {
  my $error = $@ || "unknown error while iterating over the corpora\n";
  die $error unless ($error =~ /HITLIMIT/);
}

# Buffered write errors only surface at close time, so check every close.
foreach my $index (0 .. $#bucket_fhs) {
  $bucket_fhs[$index]->close()
    or die "Could not close '$prefix." . ($index + 1) . "': $!\n";
}
if ($opt_l && $messagecount < $opt_l) {
  warn "warning: only found $messagecount messages instead of $opt_l\n";
}

#############################################

# Per-message callback registered with the archive iterator.  Appends one
# message to the current bucket file and then advances the bucket index
# round-robin.
#
# Arguments (unpacked from @_):
#   (first)     ignored
#   $msg_id     not used by this callback
#   $time       not used by this callback
#   $data_ref   array reference holding the message text; its first
#               element is checked for an mbox "From " separator line
#
# Side effects: may prepend a synthetic "From " line to @$data_ref,
# increments $messagecount and updates $current_bucket.
#
# Dies with 'HITLIMIT' once $opt_l messages have been written (the
# top-level eval treats that as a normal early stop), and with a
# descriptive message if the bucket file cannot be written.
sub wanted {
  my (undef, $msg_id, $time, $data_ref) = @_;

  # Test the limit before handling the message and count only messages
  # that are actually written, so exactly $opt_l messages are emitted and
  # $messagecount equals the number of messages written.
  if ($opt_l && $messagecount >= $opt_l) { die 'HITLIMIT'; }
  $messagecount++;

  # Make sure message can be used for outputting mbox format: if it does
  # not already start with a "From " separator line, add a fixed one.
  if (!defined $data_ref->[0] || $data_ref->[0] !~ /^From \S+ +... ... /) {
    unshift(@$data_ref, "From abc\@xyz.com Mon Jan  1 00:00:00 2000\n");
  }

  $bucket_fhs[$current_bucket]->print( join("", @$data_ref) )
    or die "Could not write to bucket " . ($current_bucket + 1) . ": $!\n";

  $current_bucket = ($current_bucket + 1) % $num_buckets;
} # wanted()