bayes_dump_to_trusted_networks [plain text]

#!/usr/bin/perl -w

=head1 NAME

bayes_dump_to_trusted_networks - generate configuration from Bayes database

=head1 SYNOPSIS

sa-learn --dump | bayes_dump_to_trusted_networks [opts] > trust.cf
bayes_dump_to_trusted_networks bayes.dump [opts] > trust.cf

options:

  --minham n
  --rdns

=head1 DESCRIPTION

This tool uses a dump of your Bayes database to determine which
IP addresses are 'trustworthy', and therefore should be listed in
'trusted_networks' lines in your configuration.

This will reduce unnecessary DNSBL lookups, will whitelist mails from
trustworthy sources, and allows several SpamAssassin rules to operate more
effectively.

A 'trustworthy' IP is one that is trusted not to B<forge> emails; in other
words, it's not a subverted machine running a proxy, or one under spammer
control.

As such, any IP that has relayed 3 or more ham mails (see C<--minham>) is considered
trustworthy.  It doesn't matter if it has ever relayed spam mails to you, since
large ISP smarthost relays will have done so -- relaying both ham and spam from
their customer pool.  (The important thing is that it relayed the mail without
forging sender address information.)

=head1 OPTIONS

=over 4

=item --minham n

Require C<n> or more ham messages before considering an IP a candidate
for trust.  Default: 3.

=item --rdns

Annotate with reverse-DNS for that IP address.  Slows things down,
but easier to read.

=back

=head1 PREREQUISITES

C<Net::CIDR::Lite>

=cut

# Net::CIDR::Lite is used to consolidate addresses into CIDR ranges.  It is
# loaded at run time so that a missing module is reported with a readable
# message instead of a bare compile failure.
eval { require Net::CIDR::Lite; 1 }
  or die "Net::CIDR::Lite module is required to use this script.  Aborting.\n".
      "(error was: $@)\n";

use strict;

use vars qw{
  $IP_ADDRESS $IP_IN_RESERVED_RANGE
};

# an IP address, in IPv4, IPv4-mapped-in-IPv6, or IPv6 format.  NOTE: cannot
# just refer to $IPV4_ADDRESS, due to perl bug reported in nesting qr//s. :(
#
# Shape of the pattern (compiled with /i and /x, word boundary at both ends):
#   - an optional literal "IPv6:" prefix, then either
#   - a dotted quad whose four octets are each limited to 0-255, optionally
#     preceded by an IPv4-mapped prefix of the form "0*:0*:ffff:" (with an
#     optional extra "0*:" group), or
#   - at least six colon-separated groups of zero to four hex digits.
# Every group is non-capturing, so code that needs the matched text has to
# wrap the interpolated pattern in its own parentheses.
#
$IP_ADDRESS = qr/\b (?:IPv6:)? (?: (?:0*:0*:ffff:(?:0*:)?)? # IPv4-mapped-in-IPv6
                    (?:1\d\d|2[0-4]\d|25[0-5]|\d\d|\d)\.
                    (?:1\d\d|2[0-4]\d|25[0-5]|\d\d|\d)\.
                    (?:1\d\d|2[0-4]\d|25[0-5]|\d\d|\d)\.
                    (?:1\d\d|2[0-4]\d|25[0-5]|\d\d|\d)
                  | # an IPv6 address, seems to always be at least 6 words
                    [a-f0-9]{0,4} \:[a-f0-9]{0,4}
                    \:[a-f0-9]{0,4} \:[a-f0-9]{0,4}
                    \:[a-f0-9]{0,4} \:[a-f0-9]{0,4} (?:\:[a-f0-9]{0,4})*
                  )\b/ix;

# Leading octets of IPv4 addresses that can never identify a public mail
# relay: private, loopback, link-local, shared (carrier-grade NAT),
# documentation, benchmarking, multicast and future-use space, following the
# IANA IPv4 Special-Purpose Address Registry.
#
# The pattern is anchored at the start of a dotted quad and always ends on an
# octet boundary (the trailing "\."), so "10" matches "10.1.2.3" but not
# "100.1.2.3".
#
# Earlier revisions of this table also listed many /8 blocks (1, 2, 5, 7, 23,
# 27, 31, 37, 39, 41, 42, 58-60, 70-79, 90-126, 197, 222, 223) that were still
# unallocated when it was written.  IANA has since allocated all of them, so
# addresses there are ordinary public space and must not be skipped.
$IP_IN_RESERVED_RANGE = qr{^(?:
  0|                               # 0/8:           "This network"
  10|                              # 10/8:          Private Use
  100\.(?:6[4-9]|[7-9][0-9]|1[01][0-9]|12[0-7])|   # 100.64/10: Shared Address Space
  127|                             # 127/8:         Loopback
  169\.254|                        # 169.254/16:    Link Local (APIPA)
  172\.(?:1[6-9]|2[0-9]|3[01])|    # 172.16/12:     Private Use
  192\.0\.[02]|                    # 192.0.0/24, 192.0.2/24: IETF protocol, TEST-NET-1
  192\.168|                        # 192.168/16:    Private Use
  198\.1[89]|                      # 198.18/15:     Benchmarking
  198\.51\.100|                    # 198.51.100/24: TEST-NET-2
  203\.0\.113|                     # 203.0.113/24:  TEST-NET-3
  2(?:2[4-9]|3[0-9])|              # 224/4:         Multicast
  2(?:4[0-9]|5[0-5])               # 240/4:         Reserved, incl. broadcast
)\.}x;

use Getopt::Long;

# Abort with the one-line synopsis.  Called for --help and for any option
# that GetOptions() rejects.
sub usage {
  die "\nusage: bayes_dump_to_trusted_networks [--minham n] [--rdns] [file]\n";
}

# Command-line switches, kept as package globals so GetOptions() can fill
# them in by reference.
use vars qw($opt_minham $opt_rdns $opt_help);

my %option_spec = (
  'minham:i' => \$opt_minham,	# integer, value optional
  'rdns'     => \$opt_rdns,
  'help'     => \$opt_help,
);

usage() unless GetOptions(%option_spec);
usage() if $opt_help;

# fall back to the defaults when a switch is absent (or given as zero)
$opt_rdns   = 0 unless $opt_rdns;
$opt_minham = 3 unless $opt_minham;

# Collect candidate relay addresses from the Bayes dump, grouped by class-C
# prefix.  Each key is "a.b.c"; each value is a space-separated list of the
# final octets seen under that prefix.
my %class_cs = ();
while (my $line = <>) {
  # dump columns are: probability, spam count, ham count, atime, token
  my ($nham, $tok) = (split ' ', $line)[2, 4];
  next unless defined $tok;		# blank or truncated line

  # only select IP-address Received tokens
  next unless $tok =~ /^H\*r:ip\*(${IP_ADDRESS})/;
  my $addr = $1;

  next unless ($nham >= $opt_minham);	# has relayed >= n ham mails

  # Grouping by class C only makes sense for dotted quads.  Pull the IPv4
  # address off the end of the match: this drops any "IPv6:" or IPv4-mapped
  # ("::ffff:") prefix, and skips pure IPv6 addresses outright.  Checking the
  # match explicitly matters: a failed match leaves $1/$2 holding whatever an
  # earlier match captured.
  next unless $addr =~ /((\d+\.\d+\.\d+)\.(\d+))$/;
  my ($ip, $class_c, $octet) = ($1, $2, $3);

  # $ip now holds only digits and dots, so it needs no further sanitising
  # before it is later interpolated into the reverse-DNS shell command.
  next if ($ip =~ /$IP_IN_RESERVED_RANGE/);

  if (defined $class_cs{$class_c}) {
    $class_cs{$class_c} .= " ".$octet;
  } else {
    $class_cs{$class_c} = $octet;
  }
}

# Walk the class-C prefixes in numeric order and print one
# "trusted_networks" line for every consolidated range or single host.
my @ordered_prefixes =
    sort { classc2long($a) <=> classc2long($b) } keys %class_cs;

for my $prefix (@ordered_prefixes) {
  # merge the addresses seen under this prefix into as few CIDR blocks as
  # possible
  my $merger = Net::CIDR::Lite->new;
  $merger->add_ip ("$prefix.$_") for split (' ', $class_cs{$prefix});
  $merger->clean();

  my @nets = $merger->list();

  for my $net (@nets) {
    # a lone host is printed without its "/32" mask
    my $single_host = ($net =~ s/\/32$//);
    my $note = '';

    if ($opt_rdns) {
      if ($single_host) {
        # ask host(1) for the PTR record and boil its output down to one line
        my $answer = `host -W 1 $net 2>&1`;
        for ($answer) {
          s/^.* domain name pointer (\S+)\s*$/$1/gm;
          s/^.*not found: .*$/(no rdns)/gs;
          s/\s+/ /gs;	# (newlines)
          s/\.$//;
        }
        $note = "\t# $answer";
      }
      else {
        $note = "\t# (range)";
      }
    }

    print "trusted_networks $net$note\n";
  }
}

# Turn a class-C prefix of the form "a.b.c" into a single integer
# (a << 16 | b << 8 | c) so that prefixes can be sorted numerically.
# Returns -1 for any string that is not exactly three dot-separated numbers.
sub classc2long {
  my ($prefix) = @_;

  my @octets = ($prefix =~ /^(\d+)\.(\d+)\.(\d+)$/);
  return -1 unless @octets;

  return ($octets[0] << 16) | ($octets[1] << 8) | ($octets[2]);
}