=head1 NAME
Mail::SpamAssassin::Bayes - determine spammishness using a Bayesian classifier
=head1 SYNOPSIS
=head1 DESCRIPTION
This is a Bayesian-like form of probability-analysis classification, using an
algorithm based on the one detailed in Paul Graham's I<A Plan For Spam> paper
at:
http://www.paulgraham.com/
It also incorporates some other aspects taken from Gary Robinson's webpage
on the subject at:
http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
And the chi-square probability combiner as described here:
http://www.linuxjournal.com/print.php?sid=6467
The results are incorporated into SpamAssassin as the BAYES_* rules.
=head1 METHODS
=over 4
=cut
package Mail::SpamAssassin::Bayes;
use strict;
use bytes;
use Mail::SpamAssassin;
use Mail::SpamAssassin::PerMsgStatus;
use Digest::SHA1 qw(sha1 sha1_hex);
use vars qw{
@ISA
$IGNORED_HDRS
$MARK_PRESENCE_ONLY_HDRS
%HEADER_NAME_COMPRESSION
$OPPORTUNISTIC_LOCK_VALID
};
@ISA = qw();
# Header names that are never tokenized: tokenize_headers() skips every
# header line matching /^${IGNORED_HDRS}:/i.  The pattern is compiled with
# /x, so the whitespace and line breaks inside it are insignificant.
$IGNORED_HDRS = qr{(?: (?:X-)?Sender |Delivered-To |Delivery-Date
|(?:X-)?Envelope-To
|X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text
|Subject
|Date
|X-List|(?:X-)?Mailing-List
|(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe
|Unsubscribe|Host|Id|Manager|Admin|Comment
|Name|Url)
|X-Unsub(?:scribe)?
|X-Mailman-Version |X-Been[Tt]here |X-Loop
|Mail-Followup-To
|X-eGroups-(?:Return|From)
|X-MDMailing-List
|X-XEmacs-List
|(?:X-)?Resent-(?:From|To|Date)
|(?:X-)?Original-(?:From|To|Date)
|X-MailScanner(?:-SpamCheck)?
|X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))?
|X-Antispam |X-RBL-Warning |X-Mailscanner
|X-MDaemon-Deliver-To |X-Virus-Scanned
|X-Mass-Check-Id
|X-Pyzor |X-DCC-\S{2,25}-Metrics
|X-Filtered-B[Yy] |X-Scanned-By |X-Scanner
|X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status
|X-SpamCop-[^:]+
|X-SMTPD |(?:X-)?Spam-Apparently-To
|SPAM |X-Perlmx-Spam
|X-Bogosity
|Content-Class |Thread-(?:Index|Topic)
|X-Original[Aa]rrival[Tt]ime
|(?:X-)?Status |X-Flags |Replied |Forwarded
|Lines |Content-Length
|X-UIDL? |X-IMAPbase
|X-Bugzilla-[^:]+
|X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified
|Summary-Format|VHeader|v\d-Data|Message-Order)
| X-Gnus-Mail-Source
| Xref
)}x;
# Headers for which tokenize_headers() records presence only: the header
# value is replaced by "1" instead of being tokenized.
$MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face
|X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint
)}ix;
# Compile-time switches for the tokenizer (1 = enabled, 0 = disabled).

# tokenize_line(): lower-case the capital that starts a sentence in body/URI text.
use constant IGNORE_TITLE_CASE => 1;
# tokenize_line(): split over-long tokens containing 8-bit bytes into "8:" tuples.
use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 1;
# NOTE(review): not referenced by any code visible in this file (only in a
# comment inside tokenize_line()); the per-region *_AS_SKIPS flags below are
# the ones actually tested.
use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
# tokenize_headers(): run address headers through pre_chew_addr_header().
use constant PRE_CHEW_ADDR_HEADERS => 1;
# tokenize_line(): emit "UD:" tokens for body tokens that look like host names.
use constant CHEW_BODY_URIS => 1;
# tokenize_line(): break body tokens containing "@" down via tokenize_mail_addrs().
use constant CHEW_BODY_MAILADDRS => 1;
# tokenize_line(): per-region (headers=0, body=1, URIs=2) choice of whether
# over-long tokens are shortened to an "sk:" + first-7-characters token.
use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;
# tokenize_headers(): when true, Message-ID header lines are dropped entirely.
use constant IGNORE_MSGID_TOKENS => 0;
# tokenize_line(): also emit punctuation-stripped and lower-cased variants of
# body/URI tokens.
use constant DECOMPOSE_BODY_TOKENS => 1;
# tokenize_headers(): additionally store certain header values under the
# shared keys "*MI" (message-id family), "*Ad" (From/To/Cc) and "*UA"
# (X-Mailer/User-Agent).
use constant MAP_HEADERS_MID => 1;
use constant MAP_HEADERS_FROMTOCC => 1;
use constant MAP_HEADERS_USERAGENT => 1;
# tokenize(): tokenize the invisible rendered body text with an "I*:" prefix
# and/or with no prefix at all.
use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;
use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;
# Short aliases substituted for common header names by tokenize_headers()
# before the name becomes part of the "H<name>:" token prefix built in
# tokenize().  Lookup is an exact, case-sensitive hash match, which is why
# differently-cased spellings of the same header have separate entries.
# Note that 'Organization' and 'Organisation' deliberately share one alias.
%HEADER_NAME_COMPRESSION = (
'Message-Id' => '*m',
'Message-ID' => '*M',
'Received' => '*r',
'User-Agent' => '*u',
'References' => '*f',
'In-Reply-To' => '*i',
'From' => '*F',
'Reply-To' => '*R',
'Return-Path' => '*p',
'Return-path' => '*rp',
'X-Mailer' => '*x',
'X-Authentication-Warning' => '*a',
'Organization' => '*o',
'Organisation' => '*o',
'Content-Type' => '*c',
'X-Spam-Relays-Trusted' => '*RT',
'X-Spam-Relays-Untrusted' => '*RU',
);
# opportunistic_calls() adds this to the "running expire" token value and
# compares the sum with time(); while the sum is still in the future the
# opportunistic sync/expire is skipped.
$OPPORTUNISTIC_LOCK_VALID = 300;
# When true, compute_prob_for_token() adjusts the raw probability with
# Robinson's f(x) equation instead of ignoring tokens seen fewer than 10 times.
use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;
# Robinson x / s constants and the minimum distance from 0.5 a token
# probability must have to be used.  read_db_configs() selects the CHI_* set
# when chi-squared combining is configured and the GARY_* set otherwise.
use constant CHI_ROBINSON_X_CONSTANT => 0.538;
use constant GARY_ROBINSON_X_CONSTANT => 0.600;
use constant CHI_ROBINSON_S_CONSTANT => 0.100;
use constant GARY_ROBINSON_S_CONSTANT => 0.160;
use constant CHI_ROBINSON_MIN_PROB_STRENGTH => 0.346;
use constant GARY_ROBINSON_MIN_PROB_STRENGTH => 0.430;
# Initial value of the token counter in scan()'s significant-token loop.
use constant N_SIGNIFICANT_TOKENS => 150;
# scan() only enforces a minimum number of significant tokens when this is
# greater than zero, so -1 disables that check.
use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;
# Tokens longer than this are shortened or split by tokenize_line().
use constant MAX_TOKEN_LENGTH => 15;
# Constructor.  Builds a Bayes object bound to the given Mail::SpamAssassin
# instance and attaches a storage backend to it.
#
# Arguments:
#   $main - the owning Mail::SpamAssassin object; its {conf} hash is consulted
#           for 'bayes_store_module'.
#
# Returns the blessed object.  Dies if 'bayes_store_module' is not a
# syntactically valid package name, or if loading or constructing the
# configured store module fails.
sub new {
  my $class = shift;
  $class = ref($class) || $class;
  my ($main) = @_;

  my $self = {
    'main' => $main,
    'conf' => $main->{conf},
    'log_raw_counts' => 0,
    'use_ignores' => 1,
    'tz' => Mail::SpamAssassin::Util::local_tz(),
  };
  bless ($self, $class);

  if ($self->{conf}->{bayes_store_module}) {
    my $module = $self->{conf}->{bayes_store_module};

    # The module name is spliced into a string eval below, so refuse anything
    # that is not a plain package name; otherwise a crafted configuration
    # value would be executed as arbitrary Perl code.
    if ($module !~ /^[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z0-9_]+)*$/) {
      die "bayes: invalid bayes_store_module name '$module'\n";
    }

    my $store;
    eval '
      require '.$module.';
      $store = '.$module.'->new($self);
    ';
    if ($@) { die $@; }
    $self->{store} = $store;
  }
  else {
    # no backend configured: fall back to the DBM store
    require Mail::SpamAssassin::BayesStore::DBM;
    $self->{store} = Mail::SpamAssassin::BayesStore::DBM->new($self);
  }

  $self;
}
# Release the storage backend's database by asking it to untie.
# Returns whatever the store's untie_db() returns.
sub finish {
  my ($self) = @_;
  my $store = $self->{store};
  return $store->untie_db();
}
# Safety net: if the store still reports its database as readable, emit a
# warning and untie it.
sub sanity_check_is_untied {
  my ($self) = @_;
  my $store = $self->{store};

  if ($store->db_readable()) {
    warn "SpamAssassin: oops! still tied to bayes DBs, untie'ing\n";
    $store->untie_db();
  }
}
# Copy the Bayes-related configuration settings onto the object and select
# the matching set of Robinson constants: the CHI_* set when chi-squared
# combining is configured, the GARY_* set otherwise.  Also caches the
# product s*x under 'robinson_s_times_x'.
sub read_db_configs {
  my ($self) = @_;
  my $conf = $self->{conf};

  $self->{use_hapaxes} = $conf->{bayes_use_hapaxes};
  $self->{use_chi_sq_combining} = $conf->{bayes_use_chi2_combining};

  my ($x_const, $s_const, $min_strength) = $self->{use_chi_sq_combining}
    ? (CHI_ROBINSON_X_CONSTANT,
       CHI_ROBINSON_S_CONSTANT,
       CHI_ROBINSON_MIN_PROB_STRENGTH)
    : (GARY_ROBINSON_X_CONSTANT,
       GARY_ROBINSON_S_CONSTANT,
       GARY_ROBINSON_MIN_PROB_STRENGTH);

  $self->{robinson_x_constant} = $x_const;
  $self->{robinson_s_constant} = $s_const;
  $self->{robinson_min_prob_strength} = $min_strength;

  $self->{robinson_s_times_x} = ($x_const * $s_const);
}
# Turn a message into its set of Bayes tokens.
#
# Arguments:
#   $msg     - message object, handed to tokenize_headers()
#   $msgdata - hash ref holding the 'bayes_token_body', 'bayes_token_uris'
#              and 'bayes_token_inviz' array refs
#
# Returns a hash ref mapping the last 5 bytes of each token's SHA1 digest to
# the raw token text.  Empty tokens are dropped.
sub tokenize {
  my ($self, $msg, $msgdata) = @_;

  # (msgdata key, token prefix, region) for every body-derived text source,
  # in the order they are tokenized
  my @sources = (
    [ 'bayes_token_body', '', 1 ],
    [ 'bayes_token_uris', '', 2 ],
  );
  if (ADD_INVIZ_TOKENS_I_PREFIX) {
    push @sources, [ 'bayes_token_inviz', "I*:", 1 ];
  }
  if (ADD_INVIZ_TOKENS_NO_PREFIX) {
    push @sources, [ 'bayes_token_inviz', "", 1 ];
  }

  my @raw_tokens;
  foreach my $source (@sources) {
    my ($key, $prefix, $region) = @{$source};
    push (@raw_tokens,
          map { $self->tokenize_line ($_, $prefix, $region) }
              @{$msgdata->{$key}});
  }

  # headers: each (possibly compressed) header name becomes an "H<name>:" prefix
  my %hdrs = $self->tokenize_headers ($msg);
  foreach my $hdr_name (keys %hdrs) {
    push (@raw_tokens,
          $self->tokenize_line ($hdrs{$hdr_name}, "H$hdr_name:", 0));
  }

  # key the tokens by a short digest; a later token with the same key wins
  my %by_digest;
  foreach my $raw (@raw_tokens) {
    if (length($raw)) {
      $by_digest{substr(sha1($raw), -5)} = $raw;
    }
  }

  return \%by_digest;
}
# Split one line of text into Bayes tokens.
#
# Arguments (positional):
#   $_[1] - the text to tokenize
#   $_[2] - prefix prepended to the emitted tokens (e.g. "H*r:" or "I*:")
#   $_[3] - region the text came from: 0 = headers, 1 = body, 2 = URIs
#
# Returns the list of tokens.  For body and URI text this includes the extra
# mail-address, "UD:" domain and decomposed (punctuation-free / lower-cased)
# variants.
sub tokenize_line {
  my $self = $_[0];
  my $tokprefix = $_[2];
  my $region = $_[3];
  local ($_) = $_[1];

  my @rettokens = ();

  # replace every run of characters outside the allowed set with one space
  tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;

  # DO split on "..." or "--" or "---"; common formatting error resulting in
  # hapaxes.  Keep the separator itself as a token, though, as long ones can
  # be good spamsigns.
  s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
  s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;

  if (IGNORE_TITLE_CASE) {
    if ($region == 1 || $region == 2) {
      # lower-case Title Case at start of a full-stop-delimited line (as would
      # be seen in a Western language).
      s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge;
    }
  }

  my $magic_re = $self->{store}->get_magic_re();

  foreach my $token (split) {
    $token =~ s/^[-'"\.,]+//;        # trim non-alphanum chars at start or end
    $token =~ s/[-'"\.,]+$//;        # so we don't get loads of '"foo' tokens

    # Skip tokens that collide with the store's magic tokens.  The test must
    # be made against the individual token: matching against $_ would test
    # the whole line (and $_ may have been altered by helpers called below).
    # The store may return undef when it has no magic tokens.
    next if ( defined $magic_re && $token =~ /$magic_re/ );

    my $len = length($token);
    next if $len < 3 ||
      ($token =~ /^(?:a(?:nd|ny|ble|ll|re)|
        m(?:uch|ost|ade|ore|ail|ake|ailing|any|ailto)|
        t(?:his|he|ime|hrough|hat)|
        w(?:hy|here|ork|orld|ith|ithout|eb)|
        f(?:rom|or|ew)| e(?:ach|ven|mail)|
        o(?:ne|ff|nly|wn|ut)| n(?:ow|ot|eed)|
        s(?:uch|ame)| l(?:ook|ike|ong)|
        y(?:ou|our|ou're)|
        The|has|have|into|using|http|see|It's|it's|
        number|just|both|come|years|right|know|already|
        people|place|first|because|
        And|give|year|information|can)$/x);

    # are we in the body?  If so, apply some body-specific breakouts
    if ($region == 1 || $region == 2) {
      if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
        push (@rettokens, $self->tokenize_mail_addrs ($token));
      }
      elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
        push (@rettokens, "UD:".$token); # the full token
        my $bit = $token;
        # strip one leading dot-separated label per iteration
        while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
          push (@rettokens, "UD:".$1); # UD = URL domain
        }
      }
    }

    # note: do not trim down overlong tokens if they contain '*'.  This is
    # used as part of split tokens such as "HTo:D*net" indicating that
    # the domain ".net" appeared in the To header.
    #
    if ($len > MAX_TOKEN_LENGTH && $token !~ /\*/) {
      if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
        # Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
        # but I'm doing tuples to keep the dbs small(er)."  Sounds like a plan
        # to me! (jm)
        while ($token =~ s/^(..?)//) {
          push (@rettokens, "8:$1");
        }
        next;
      }

      if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
          || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
          || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
      {
        # if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
        # Spambayes trick via Matt: Just retain 7 chars.  Do not retain
        # the length, it does not help; see my mail to -devel of Nov 20 2002.
        # "sk:" stands for "skip".
        $token = "sk:".substr($token, 0, 7);
      }
    }

    # decompose tokens?  do this after shortening long tokens
    if ($region == 1 || $region == 2) {
      if (DECOMPOSE_BODY_TOKENS) {
        if ($token =~ /[^\w:\*]/) {
          my $decompd = $token;                        # "Foo!"
          $decompd =~ s/[^\w:\*]//gs;
          push (@rettokens, $tokprefix.$decompd);      # "Foo"
        }

        if ($token =~ /[A-Z]/) {
          my $decompd = $token; $decompd = lc $decompd;
          push (@rettokens, $tokprefix.$decompd);      # "foo!"

          if ($token =~ /[^\w:\*]/) {
            $decompd =~ s/[^\w:\*]//gs;
            push (@rettokens, $tokprefix.$decompd);    # "foo"
          }
        }
      }
    }

    push (@rettokens, $tokprefix.$token);
  }

  return @rettokens;
}
# Collect and pre-process the header text that tokenize() later splits into
# tokens.
#
# Arguments:
#   $msg - message object; get_all_headers() and get_all_metadata() are
#          called on it.
#
# Returns a hash (as a flat list) whose keys are header names -- replaced by
# their %HEADER_NAME_COMPRESSION alias where one exists -- plus the shared
# keys "*MI", "*Ad" and "*UA", and whose values are the pre-processed header
# text.  Values of repeated headers are joined with a single space.
sub tokenize_headers {
my ($self, $msg) = @_;
my %parsed = ();
# header names the user asked to be ignored (bayes_ignore_headers setting)
my %user_ignore;
$user_ignore{$_} = 1 for @{$self->{main}->{conf}->{bayes_ignore_headers}};
# get headers in array context
my @hdrs;
my @rcvdlines;
for ($msg->get_all_headers()) {
# first, keep a copy of Received headers, so we can strip down to last 2
if (/^Received:/i) {
push(@rcvdlines, $_);
next;
}
# and now skip lines for headers we don't want (including all Received)
next if /^${IGNORED_HDRS}:/i;
next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
push(@hdrs, $_);
}
# metadata lines are processed exactly like real headers from here on
push(@hdrs, $msg->get_all_metadata());
# and re-add the last 2 received lines: usually a good source of
# spamware tokens and HELO names.
if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
for (@hdrs) {
next unless /\S/;
# split into name and value at the first colon only
my ($hdr, $val) = split(/:/, $_, 2);
# remove user-specified headers here, after Received, in case they
# want to ignore that too
next if exists $user_ignore{$hdr};
# Prep the header value
# (note: "||=" also turns a value that is the string "0" into '')
$val ||= '';
chomp($val);
# special tokenization for some headers:
if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) {
$val = $self->pre_chew_message_id ($val);
}
elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-)
(?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix)
{
$val = $self->pre_chew_addr_header ($val);
}
elsif ($hdr eq 'Received') {
$val = $self->pre_chew_received ($val);
}
elsif ($hdr eq 'Content-Type') {
$val = $self->pre_chew_content_type ($val);
}
elsif ($hdr eq 'MIME-Version') {
$val =~ s/1\.0//; # totally innocuous
}
elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
$val = "1"; # just mark the presence, they create lots of hapaxen
}
# The three mappings below store the value under a shared key in addition
# to the header's own key.  Each is a plain assignment, so a later matching
# header overwrites the value stored by an earlier one.
if (MAP_HEADERS_MID) {
if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) {
$parsed{"*MI"} = $val;
}
}
if (MAP_HEADERS_FROMTOCC) {
if ($hdr =~ /^(?:From|To|Cc)$/i) {
$parsed{"*Ad"} = $val;
}
}
if (MAP_HEADERS_USERAGENT) {
if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) {
$parsed{"*UA"} = $val;
}
}
# replace hdr name with "compressed" version if possible
if (defined $HEADER_NAME_COMPRESSION{$hdr}) {
$hdr = $HEADER_NAME_COMPRESSION{$hdr};
}
# repeated headers accumulate, separated by a space
if (exists $parsed{$hdr}) {
$parsed{$hdr} .= " ".$val;
} else {
$parsed{$hdr} = $val;
}
dbg ("tokenize: header tokens for $hdr = \"$parsed{$hdr}\"");
}
return %parsed;
}
# Pre-process a Content-Type header value before tokenization.
#
# Any quoted boundary="..." parameter is removed from the value; the shape of
# the (last) boundary string is then appended instead, with hex digits
# replaced by "H" and runs of separator characters split off as separate
# words.  Finally the first occurrence of the stop-words "text" or "charset"
# is deleted.
#
# Arguments: $val - the header value.  Returns the rewritten value.
sub pre_chew_content_type {
  my ($self, $val) = @_;

  # hopefully this will retain good bits without too many hapaxen
  my $had_boundary = ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig);
  if ($had_boundary) {
    my $shape = $1;

    # hex digits vary per message; keep only their positions
    $shape =~ s/[a-fA-F0-9]/H/gs;

    # break up blocks of separator chars so they become their own tokens
    $shape =~ s/([-_\.=]+)/ $1 /gs;

    $val .= $shape;
  }

  # stop-list words for Content-Type header: these wind up totally gray
  $val =~ s/\b(?:text|charset)\b//;

  return $val;
}
# Pre-process a Message-Id style header value before tokenization.
#
# Arguments: $val - the header value.
# Returns the value with some well-known ID formats rewritten or removed and
# every character other than [_A-Za-z0-9] replaced by a space.  The
# substitutions are order-dependent: the format-specific ones rely on the
# "<", "$", "-", "." and "@" characters that the final substitution removes.
sub pre_chew_message_id {
my ($self, $val) = @_;
# we can (a) get rid of a lot of hapaxen and (b) increase the token
# specificity by pre-parsing some common formats.
# Outlook Express format:
# (keeps the first 4 hex digits of each of the first two "$"-separated
# fields, the whole third field and the part after "@")
$val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
([0-9a-f]{4})[0-9a-f]{4}\$
([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
# Exim:
# (drops the local part up to and including "@"; first occurrence only)
$val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
# Sendmail:
# (drops a timestamp-plus-hex-ID local part up to and including "@")
$val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
[A-F0-9]{10,12}\@//gx;
# try to split Message-ID segments on probable ID boundaries. Note that
# Outlook message-ids seem to contain a server identifier ID in the last
# 8 bytes before the @. Make sure this becomes its own token, it's a
# great spam-sign for a learning system! Be sure to split on ".".
$val =~ s/[^_A-Za-z0-9]/ /g;
$val;
}
# Pre-process a Received header value before tokenization.
#
# Arguments: $val - the header value.
# Returns the value with MTA queue IDs and RFC-822 style dates removed, IP
# addresses reduced as described below, and the very common words
# with/from/for/SMTP/ESMTP replaced by a space.
sub pre_chew_received {
  my ($self, $val) = @_;

  # Thanks to Dan for these.  Trim out "useless" tokens; sendmail-ish IDs
  # and valid-format RFC-822/2822 dates

  $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs;  # Sendmail
  $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs;  # Sendmail
  $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs;    # Sendmail
  $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim

  $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?
           [0-3\s]?[0-9]\s
           (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s
           (?:19|20)?[0-9]{2}\s
           [0-2][0-9](?:\:[0-5][0-9]){1,2}\s
           (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))*
           //gx;

  # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
  # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
  # (on both sides)
  # also make a dup with the full IP, as fodder for
  # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd"
  #
  # $1 and $2 are the first two octets *including* their trailing dot, so
  # the private-range test has to compare against "10." / "192." / "168.".
  $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
           if ($1 eq '10.' || ($1 eq '192.' && $2 eq '168.')) {
             $1.$2.$3.$4.
                " ip*".$1.$2.$3.$4." ";
           } else {
             $1.$2.$3.
                " ip*".$1.$2.$3.$4." ";
           }
         }gex;

  # trim these: they turn out as the most common tokens, but with a
  # prob of about .5.  waste of space!
  $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g;

  $val;
}
# Pre-process an address header value: every address found in it is broken
# down with tokenize_mail_addrs() and the resulting pieces are returned as
# one space-separated string.
sub pre_chew_addr_header {
  my ($self, $val) = @_;
  local ($_);    # protect the caller's $_ from anything done below

  my @pieces = ();
  foreach my $addr ($self->{main}->find_all_addrs_in_line ($val)) {
    push (@pieces, $self->tokenize_mail_addrs ($addr));
  }

  return join (' ', @pieces);
}
# Break a mail address down into tokens.
#
# Arguments: $addr - a string of the form "user@domain".
# Returns "U*<user>", "D*<domain>" and one further "D*" token for each
# trailing part of the domain obtained by removing leading dot-separated
# labels (for "a@mail.example.com": D*mail.example.com, D*example.com,
# D*com).  Returns the empty list if $addr contains no "@" with text on
# both sides.
sub tokenize_mail_addrs {
  my ($self, $addr) = @_;

  ($addr =~ /(.+)\@(.+)$/) or return ();
  my ($user, $domain) = ($1, $2);

  my @toks = ("U*".$user, "D*".$domain);

  # Work on a lexical copy: assigning to the global $_ here would clobber
  # the caller's $_ (callers such as tokenize_line() rely on it).
  my $rest = $domain;
  while ($rest =~ s/^[^\.]+\.(.+)$/$1/gs) {
    push(@toks, "D*".$rest);
  }

  return @toks;
}
###########################################################################
# Decide whether Bayes should leave this message alone because its sender or
# recipient is listed in bayes_ignore_from / bayes_ignore_to.
#
# Arguments: $PMS - per-message status object providing check_from_in_list()
#                   and check_to_in_list().
# Returns 0 when ignore lists are disabled on this object, otherwise the
# (true or false) result of the list checks.
sub ignore_message {
  my ($self, $PMS) = @_;

  if (!$self->{use_ignores}) {
    return 0;
  }

  my $listed = $PMS->check_from_in_list('bayes_ignore_from');
  $listed ||= $PMS->check_to_in_list('bayes_ignore_to');

  if ($listed) {
    dbg("Not using Bayes, bayes_ignore_from or _to rule");
  }

  return $listed;
}
###########################################################################
# Learn a message as spam or ham.
#
# Arguments:
#   $isspam - true to learn as spam, false to learn as ham
#   $msg    - the message object
#   $id     - optional message id to use instead of deriving one
#
# Returns the result of learn_trapped(), or nothing when Bayes is disabled,
# no message was given, the message is on an ignore list, or the database
# could not be tied.  If anything dies while the database is tied, it is
# untied and the error is re-thrown.
sub learn {
  my ($self, $isspam, $msg, $id) = @_;

  return unless $self->{conf}->{use_bayes};
  return unless defined $msg;

  if ($self->{use_ignores}) {   # Remove test when PerMsgStatus available.
    # DMK, koppel@ece.lsu.edu: Hoping that the ultimate fix to bug 2263 will
    # make it unnecessary to construct a PerMsgStatus here.
    my $pms = Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
    my $skip = $self->ignore_message($pms);
    $pms->finish();
    return if $skip;
  }

  my $msgdata = $self->get_body_from_msg ($msg);
  my $main = $self->{main};
  my $store = $self->{store};
  my $result;

  eval {
    local $SIG{'__DIE__'};      # do not run user die() traps in here

    # When learning to the journal, try read-only first and fall back to
    # read-write (needed when the DB does not exist yet); otherwise go
    # straight to read-write.
    my $tied = $main->{learn_to_journal}
      ? ($store->tie_db_readonly() || $store->tie_db_writable())
      : $store->tie_db_writable();

    if ($tied) {
      $result = $self->learn_trapped ($isspam, $msg, $msgdata, $id);
      $store->untie_db() unless $main->{learn_caller_will_untie};
    }
  };

  if (my $failure = $@) {       # if we died, untie the dbs.
    $store->untie_db();
    die $failure;
  }

  return $result;
}
# this function is trapped by the wrapper above
# Does the actual work of learn(); learn() calls it inside an eval after the
# database has been tied.
#
# Arguments:
#   $isspam  - true to learn as spam, false to learn as ham
#   $msg     - the message object
#   $msgdata - body/URI data as produced by get_body_from_msg()
#   $msgid   - optional message id; when undefined the ids come from
#              get_msgid()
#
# Returns 1 after learning, 0 if the message was already learnt as the same
# class, and nothing (bare return) if forgetting a previous opposite
# learning failed fatally.
sub learn_trapped {
my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
my @msgid = ( $msgid );
if (!defined $msgid) {
@msgid = $self->get_msgid($msg);
}
# Check every candidate id against the "seen" database.
foreach $msgid ( @msgid ) {
my $seen = $self->{store}->seen_get ($msgid);
if (defined ($seen)) {
if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
dbg ("$msgid: already learnt correctly, not learning twice");
return 0;
} elsif ($seen !~ /^[hs]$/) {
warn ("db_seen corrupt: value='$seen' for $msgid. ignored");
} else {
dbg ("$msgid: already learnt as opposite, forgetting first");
# kluge so that forget() won't untie the db on us ...
my $orig = $self->{main}->{learn_caller_will_untie};
$self->{main}->{learn_caller_will_untie} = 1;
my $fatal = !defined $self->forget ($msg);
# reset the value post-forget() ...
$self->{main}->{learn_caller_will_untie} = $orig;
# forget() gave us a fatal error, so propagate that up
if ($fatal) {
dbg("forget() returned a fatal error, so learn() will too");
return;
}
}
# we're only going to have seen this once, so stop if it's been
# seen already
last;
}
}
# Now that we're sure we haven't seen this message before ...
# (a foreach loop variable regains its previous value when the loop ends,
# so the id to record is picked explicitly: the first candidate)
$msgid = $msgid[0];
if ($isspam) {
$self->{store}->nspam_nham_change (1, 0);
} else {
$self->{store}->nspam_nham_change (0, 1);
}
my $msgatime = $msg->receive_date();
# If the message atime comes back as being more than 1 day in the
# future, something's messed up and we should revert to current time as
# a safety measure.
#
$msgatime = time if ( $msgatime - time > 86400 );
my $tokens = $self->tokenize($msg, $msgdata);
# bump the spam or ham count of every token of the message
for my $token (keys %{$tokens}) {
if ($isspam) {
$self->{store}->tok_count_change (1, 0, $token, $msgatime);
} else {
$self->{store}->tok_count_change (0, 1, $token, $msgatime);
}
}
# remember how this message was learnt: 's' for spam, 'h' for ham
$self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
$self->{store}->cleanup();
$self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
isspam => $isspam,
msgid => $msgid,
msgatime => $msgatime,
});
dbg("bayes: Learned '$msgid', atime: $msgatime");
1;
}
###########################################################################
# Forget a previously learnt message.
#
# Arguments:
#   $msg - the message object
#   $id  - optional message id to use instead of deriving one
#
# Returns the result of forget_trapped(), or nothing when Bayes is disabled,
# no message was given, or the database could not be tied.  If anything dies
# while the database is tied, it is untied and the error is re-thrown.
sub forget {
  my ($self, $msg, $id) = @_;

  return unless $self->{conf}->{use_bayes};
  return unless defined $msg;

  my $msgdata = $self->get_body_from_msg ($msg);
  my $main = $self->{main};
  my $store = $self->{store};
  my $result;

  # we still tie for writing here, since we write to the seen db
  # synchronously
  eval {
    local $SIG{'__DIE__'};      # do not run user die() traps in here

    # When learning to the journal, try read-only first and fall back to
    # read-write (needed when the DB does not exist yet); otherwise go
    # straight to read-write.
    my $tied = $main->{learn_to_journal}
      ? ($store->tie_db_readonly() || $store->tie_db_writable())
      : $store->tie_db_writable();

    if ($tied) {
      $result = $self->forget_trapped ($msg, $msgdata, $id);
      $store->untie_db() unless $main->{learn_caller_will_untie};
    }
  };

  if (my $failure = $@) {       # if we died, untie the dbs.
    $store->untie_db();
    die $failure;
  }

  return $result;
}
# this function is trapped by the wrapper above
# Does the actual work of forget(); forget() calls it inside an eval after
# the database has been tied.
#
# Arguments:
#   $msg     - the message object
#   $msgdata - body/URI data as produced by get_body_from_msg()
#   $msgid   - optional message id; when undefined the ids come from
#              get_msgid()
#
# Returns 1 after forgetting, and 0 if no id of the message was learnt
# before or its "seen" entry is neither 's' nor 'h'.
sub forget_trapped {
my ($self, $msg, $msgdata, $msgid) = @_;
my @msgid = ( $msgid );
my $isspam;
if (!defined $msgid) {
@msgid = $self->get_msgid($msg);
}
# Look for the first id with a "seen" entry.  When the loop is left via
# "last", $msgid still holds that id and is used for seen_delete() below.
while( $msgid = shift @msgid ) {
my $seen = $self->{store}->seen_get ($msgid);
if (defined ($seen)) {
if ($seen eq 's') {
$isspam = 1;
} elsif ($seen eq 'h') {
$isspam = 0;
} else {
dbg ("forget: msgid $msgid seen entry is neither ham nor spam, ignored");
return 0;
}
# messages should only be learned once, so stop if we find a msgid
# which was seen before
last;
}
else {
dbg ("forget: msgid $msgid not learnt, ignored");
}
}
# This message wasn't learnt before, so return
if (!defined $isspam) {
dbg("forget: no msgid from this message has been learnt, skipping message");
return 0;
}
elsif ($isspam) {
$self->{store}->nspam_nham_change (-1, 0);
}
else {
$self->{store}->nspam_nham_change (0, -1);
}
my $tokens = $self->tokenize($msg, $msgdata);
# decrement the spam or ham count of every token of the message
for my $token (keys %{$tokens}) {
if ($isspam) {
$self->{store}->tok_count_change (-1, 0, $token);
} else {
$self->{store}->tok_count_change (0, -1, $token);
}
}
$self->{store}->seen_delete ($msgid);
$self->{store}->cleanup();
$self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
isspam => $isspam,
msgid => $msgid,
});
1;
}
###########################################################################
# Work out the id(s) under which a message is tracked in the "seen" database.
#
# A generated id -- the SHA1 hex digest of the Date header, the last Received
# header and the start of the pristine body, with '@sa_generated' appended --
# is always produced and comes first.  A usable Message-Id header, stripped
# of its angle brackets, follows it.
#
# Arguments: $msg - the message object.
# Returns the list of ids in list context, or only the first (generated) id
# in scalar context.
sub get_msgid {
  my ($self, $msg) = @_;

  my @ids = ();

  my $header_id = $msg->get_header("Message-Id");
  if (defined $header_id && $header_id ne ''
      && $header_id !~ /^\s*<\s*(?:\@sa_generated)?>.*$/)
  {
    # remove \r and < and > prefix/suffixes
    chomp $header_id;
    $header_id =~ s/^<//;
    $header_id =~ s/>.*$//g;
    push(@ids, $header_id);
  }

  # Use sha1_hex(Date:, last received: and top N bytes of body)
  # where N is MIN(1024 bytes, 1/2 of body length)
  #
  my $date = $msg->get_header("Date");
  unless (defined $date && $date ne '') {       # No Date?
    $date = "None";
  }

  my @received = $msg->get_header("Received");
  my $last_rcvd = $received[$#received];
  unless (defined $last_rcvd && $last_rcvd ne '') {     # No Received?
    $last_rcvd = "None";
  }

  # Make a copy since pristine_body is a reference ...
  my $body = join('', $msg->get_pristine_body());
  my $body_len = length($body);
  if ($body_len > 64) {         # Small Body?
    my $keep = ($body_len > 2048) ? 1024 : int($body_len / 2);
    $body = substr($body, 0, $keep);
  }

  unshift(@ids,
          sha1_hex($date."\000".$last_rcvd."\000".$body).'@sa_generated');

  return wantarray ? @ids : $ids[0];
}
# Extract the data tokenize() needs from a message.
#
# Arguments: $msg - the message object.
# Returns the hash ref built by get_msgdata_from_permsgstatus(), or an empty
# hash ref (after a warning) when $msg is not a reference or no data could
# be obtained.
sub get_body_from_msg {
  my ($self, $msg) = @_;

  if (!ref $msg) {
    # I have no idea why this seems to happen. TODO
    warn "msg not a ref: '$msg'";
    return { };
  }

  $msg->extract_message_metadata ($self->{main});

  # a temporary per-message status object is needed to get at the rendered
  # body text and the URI list
  my $permsgstatus =
        Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
  my $msgdata = $self->get_msgdata_from_permsgstatus ($permsgstatus);
  $permsgstatus->finish();

  if (!defined $msgdata) {
    # why?!
    # Report the id of the message at hand; $self has no 'msg' entry, so
    # looking it up there would hand get_msgid() an undefined message.
    warn "failed to get body for ".scalar($self->get_msgid($msg))."\n";
    return { };
  }

  return $msgdata;
}
# Gather the text sources used for tokenizing from a per-message status
# object.
#
# Arguments: $msg - the per-message status object (its {msg} entry is the
#                   underlying message).
# Returns a hash ref with the keys 'bayes_token_body' (visible rendered
# text), 'bayes_token_inviz' (invisible rendered text) and
# 'bayes_token_uris' (array ref of the URI list).
sub get_msgdata_from_permsgstatus {
  my ($self, $msg) = @_;

  my $message = $msg->{msg};
  my %data;

  $data{bayes_token_body} = $message->get_visible_rendered_body_text_array();
  $data{bayes_token_inviz} = $message->get_invisible_rendered_body_text_array();
  $data{bayes_token_uris} = [ $msg->get_uri_list() ];

  return \%data;
}
###########################################################################
# Run a journal sync and/or a token expiry on the store.
#
# Arguments:
#   $sync   - true to call the store's sync()
#   $expire - true to call the store's expire_old_tokens()
#   $opts   - passed through to both store calls
#
# Always returns 0.  Does nothing when Bayes is disabled.
sub sync {
  my ($self, $sync, $expire, $opts) = @_;

  return 0 unless $self->{conf}->{use_bayes};

  dbg("Syncing Bayes and expiring old tokens...");
  if ($sync) {
    $self->{store}->sync($opts);
  }
  if ($expire) {
    $self->{store}->expire_old_tokens($opts);
  }
  dbg("Syncing complete.");

  return 0;
}
###########################################################################
# compute the probability that that token is spammish
# Compute the probability that a token is spammish.
#
# Arguments:
#   $token - the token (database key)
#   $ns    - number of spam messages learnt
#   $nn    - number of ham messages learnt
#   $s, $n - optional spam and ham counts of the token; looked up in the
#            store when either is undefined
#
# Returns the probability, or nothing when the token cannot be used: it was
# never seen, it is too rare for the current settings, or one of the corpus
# sizes is zero.
sub compute_prob_for_token {
  my ($self, $token, $ns, $nn, $s, $n) = @_;

  # we allow the caller to give us the token information, just
  # to save a potentially expensive lookup
  unless (defined($s) && defined($n)) {
    ($s, $n, undef) = $self->{store}->tok_get ($token);
  }

  return if ($s == 0 && $n == 0);

  my $occurrences = $s + $n;

  # ignore low-freq tokens when Robinson's f(x) is not in use
  return if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS && $occurrences < 10);

  # hapaxes (tokens seen only once) are optional
  return if (!$self->{use_hapaxes} && $occurrences < 2);

  return if ($ns == 0 || $nn == 0);

  my $spam_ratio = ($s / $ns);
  my $ham_ratio = ($n / $nn);

  if ($spam_ratio == 0 && $ham_ratio == 0) {
    warn "oops? ratios == ration == 0";
    return;
  }

  my $prob = ($spam_ratio) / ($ham_ratio + $spam_ratio);

  if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
    # use Robinson's f(x) equation for low-n tokens, instead of just
    # ignoring them
    $prob = ($self->{robinson_s_times_x} + ($occurrences * $prob))
                             /
            ($self->{robinson_s_constant} + $occurrences);
  }

  $self->{raw_counts} .= " s=$s,n=$n " if $self->{log_raw_counts};

  return $prob;
}
###########################################################################
# If a token is neither hammy nor spammy, return 0.
# For a spammy token, return the minimum number of additional ham messages
# it would have had to appear in to no longer be spammy. Hammy tokens
# are handled similarly. That's what the function does (at the time
# of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly
# more useful if it returned the number of /additional/ ham messages
# a spammy token would have to appear in to no longer be spammy but I
# fear that might require the solution to a cubic equation, and I
# just don't have the time for that now.
sub compute_declassification_distance {
# NOTE(review): by analogy with compute_prob_for_token(), $Ns/$Nn look like
# the corpus message counts (spam/ham) and $ns/$nn the token's spam/ham
# counts -- confirm against the callers.  $prob is the token's probability.
my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
# The same "token is unusable" conditions as in compute_prob_for_token()
# yield a distance of 0.
return 0 if $ns == 0 && $nn == 0;
if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
return 0 if $Ns == 0 || $Nn == 0;
# token is neither hammy nor spammy
return 0 if abs( $prob - 0.5 ) < $self->{robinson_min_prob_strength};
# "a" is the class opposite to the token's current leaning (ham for a
# spammy token, spam for a hammy one), "b" is the class it leans towards.
my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
# $p is the lower edge of the probability band in which tokens are ignored
my $p = 0.5 - $self->{robinson_min_prob_strength};
# closed-form answer when Robinson's f(x) adjustment is not in use
return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
# With the f(x) adjustment the required count is a root of the quadratic
# $a*x^2 + $b*x + $c = 0.
my $s = $self->{robinson_s_constant};
my $sx = $self->{robinson_s_times_x};
my $a = $Nb * ( 1 - $p );
my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
my $discrim = $b * $b - 4 * $a * $c;
# clamp a negative discriminant to zero so that sqrt() cannot fail
my $disc_max_0 = $discrim < 0 ? 0 : $discrim;
# the "+" root, nudged up by (1 - 1e-6) so that int() below rounds upwards,
# minus the count the token already has in class "a"
my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;
# This shouldn't be necessary. Should not be < 1
return $dd_exact < 1 ? 1 : int($dd_exact);
}
# Check to make sure we can tie() the DB, and we have enough entries to do a scan
# if we're told the caller will untie(), go ahead and leave the db tied.
# Check whether a Bayes scan can be performed: Bayes must be enabled, the
# database must tie read-only, and enough spam and ham messages must have
# been learnt.
#
# Returns 1 if a scan is possible (the database is left tied), 0 otherwise.
# On a "not enough messages" failure the database is untied unless the
# caller has announced that it will untie itself.
sub is_scan_available {
  my ($self) = @_;

  my $conf = $self->{conf};
  my $main = $self->{main};
  my $store = $self->{store};

  if (!$conf->{use_bayes}) { return 0; }
  if (!$store->tie_db_readonly()) { return 0; }

  # We need the DB to stay tied, so if the journal sync occurs, don't untie!
  # Do the journal sync (if necessary) before the nspam_nham_get() call,
  # since the sync may change the number of messages learnt.  The flag is
  # put back to its previous value afterwards.
  my $saved_untie_flag = $main->{learn_caller_will_untie};
  $main->{learn_caller_will_untie} = 1;
  $self->opportunistic_calls(1);
  $main->{learn_caller_will_untie} = $saved_untie_flag;

  my ($ns, $nn) = $store->nspam_nham_get();

  # spam count is checked first, then ham count
  my $shortfall;
  if ($ns < $conf->{bayes_min_spam_num}) {
    $shortfall = "bayes: Not available for scanning, only $ns spam(s) in Bayes DB < ".$conf->{bayes_min_spam_num};
  }
  elsif ($nn < $conf->{bayes_min_ham_num}) {
    $shortfall = "bayes: Not available for scanning, only $nn ham(s) in Bayes DB < ".$conf->{bayes_min_ham_num};
  }

  if (defined $shortfall) {
    dbg($shortfall);
    $store->untie_db() unless $main->{learn_caller_will_untie};
    return 0;
  }

  return 1;
}
###########################################################################
# Finally, the scoring function for testing mail.
# Score a message.
#
# Arguments:
#   $permsgstatus - per-message status object; receives the token info
#                   lists, the corpus sizes and the BAYESTC* tag data
#   $msg          - the message object
#
# Returns the combined spam probability, or undef when the message cannot be
# scored (ignored sender/recipient, database unavailable or too small, no
# known or significant tokens, or no combined probability).
sub scan {
  my ($self, $permsgstatus, $msg) = @_;
  my $score;

  # When we're doing a scan, we'll guarantee that we'll do the untie,
  # so override the global setting until we're done.
  my $caller_untie = $self->{main}->{learn_caller_will_untie};
  $self->{main}->{learn_caller_will_untie} = 1;

  goto skip if ($self->ignore_message($permsgstatus));

  goto skip unless $self->is_scan_available();

  my ($ns, $nn) = $self->{store}->nspam_nham_get();

  if ($self->{log_raw_counts}) {
    $self->{raw_counts} = " ns=$ns nn=$nn ";
  }

  dbg ("bayes corpus size: nspam = $ns, nham = $nn");

  my $msgdata = $self->get_msgdata_from_permsgstatus ($permsgstatus);

  my $msgtokens = $self->tokenize($msg, $msgdata);

  my $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});

  # per-token probability and counts, keyed by the (hashed) token
  my %pw;

  foreach my $tokendata (@{$tokensdata}) {
    my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
    my $prob = $self->compute_prob_for_token($token, $ns, $nn, $tok_spam, $tok_ham);
    if (defined($prob)) {
      $pw{$token}->{prob} = $prob;
      $pw{$token}->{spam_count} = $tok_spam;
      $pw{$token}->{ham_count} = $tok_ham;
      $pw{$token}->{atime} = $atime;
    }
  }

  # If none of the tokens were found in the DB, we're going to skip
  # this message...
  if (!keys %pw) {
    dbg ("cannot use bayes on this message; none of the tokens were found in the database");
    goto skip;
  }

  my $tcount_total = keys %{$msgtokens};
  my $tcount_learned = keys %pw;

  # Figure out the message receive time (used as atime below)
  # If the message atime comes back as being in the future, something's
  # messed up and we should revert to current time as a safety measure.
  #
  my $msgatime = $msg->receive_date();
  my $now = time;
  $msgatime = $now if ( $msgatime > $now );

  # now take the $count most significant tokens and calculate probs using
  # Robinson's formula.
  my $count = N_SIGNIFICANT_TOKENS;
  my @sorted = ();

  my ($tcount_spammy,$tcount_hammy) = (0,0);
  my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
  my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];

  my @touch_tokens;

  # strongest tokens (furthest from 0.5) first
  for (sort {
              abs($pw{$b}->{prob} - 0.5) <=> abs($pw{$a}->{prob} - 0.5)
            } keys %pw)
  {
    if ($count-- < 0) { last; }
    my $pw = $pw{$_}->{prob};
    next if (abs($pw - 0.5) < $self->{robinson_min_prob_strength});

    # What's more expensive, scanning headers for HAMMYTOKENS and
    # SPAMMYTOKENS tags that aren't there or collecting data that
    # won't be used?  Just collecting the data is certainly simpler.
    #
    my $raw_token = $msgtokens->{$_} || "(unknown)";
    my $s = $pw{$_}->{spam_count};
    my $n = $pw{$_}->{ham_count};
    my $a = $pw{$_}->{atime};
    push @$tinfo_spammy, [$raw_token,$pw,$s,$n,$a] if $pw >= 0.5 && ++$tcount_spammy;
    push @$tinfo_hammy,  [$raw_token,$pw,$s,$n,$a] if $pw <  0.5 && ++$tcount_hammy;

    push (@sorted, $pw);

    # update the atime on this token, it proved useful
    push(@touch_tokens, $_);

    dbg ("bayes token '$raw_token' => $pw");
  }

  if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
        $#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
  {
    dbg ("cannot use bayes on this message; not enough usable tokens found");
    goto skip;
  }

  if ($self->{use_chi_sq_combining}) {
    $score = chi_squared_probs_combine ($ns, $nn, @sorted);
  } else {
    $score = robinson_naive_bayes_probs_combine (@sorted);
  }

  # Couldn't come up with a probability?
  goto skip unless defined $score;

  dbg ("bayes: score = $score");

  # no need to call tok_touch_all unless there were significant
  # tokens and a score was returned
  # we don't really care about the return value here
  $self->{store}->tok_touch_all(\@touch_tokens, $msgatime);

  $permsgstatus->{bayes_nspam} = $ns;
  $permsgstatus->{bayes_nham} = $nn;

  # raw-count logging: write the counts collected in {raw_counts} (by this
  # sub and by compute_prob_for_token()) to STDOUT
  if ($self->{log_raw_counts}) {
    print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
  }

  $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
                                              probsref => \%pw,
                                              score => $score,
                                              msgatime => $msgatime,
                                              significant_tokens => \@touch_tokens,
                                            });

skip:
  if (!defined $score) {
    dbg ("bayes: not scoring message, returning undef");
  }

  # Take any opportunistic actions we can take
  $self->opportunistic_calls();
  $self->{store}->cleanup();

  # Reset the untie flag to whatever it was before the scan
  $self->{main}->{learn_caller_will_untie} = $caller_untie;

  # If our caller won't untie the db, we need to do it.
  if (!$caller_untie) {
    $self->{store}->untie_db();
  }

  # The counters stay undefined when the scan was skipped before they were
  # initialised.
  $permsgstatus->{tag_data}{BAYESTCHAMMY} = $tcount_hammy;
  $permsgstatus->{tag_data}{BAYESTCSPAMMY} = $tcount_spammy;
  $permsgstatus->{tag_data}{BAYESTCLEARNED} = $tcount_learned;
  $permsgstatus->{tag_data}{BAYESTC} = $tcount_total;

  return $score;
}
# Perform maintenance that happens to be due while the database is tied: a
# full sync plus expiry, or only a journal sync.
#
# Arguments: $journal_only - when true, never run an expiry.
# Returns nothing.  Does nothing when the database is not readable or when
# a fresh "running expire" token is found.
sub opportunistic_calls {
  my ($self, $journal_only) = @_;
  my $store = $self->{store};

  unless ($store->db_readable()) {
    dbg("bayes: opportunistic call attempt failed, DB not readable");
    return;
  }

  # the token is treated as fresh for $OPPORTUNISTIC_LOCK_VALID seconds
  my $expire_token = $store->get_running_expire_tok();
  if ( defined $expire_token
       && $expire_token+$OPPORTUNISTIC_LOCK_VALID > time() )
  {
    dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");
    return;
  }

  # an expiry, when allowed and due, takes precedence over a plain sync
  if (!$journal_only && $store->expiry_due()) {
    dbg("bayes: opportunistic call found expiry due");
    $self->sync(1,1);
    return;
  }

  return unless $store->sync_due();

  dbg("bayes: opportunistic call found journal sync due");
  $self->sync(1,0);

  # the running-expire token is only removed when the DB is writable
  $store->remove_running_expire_tok() if $store->db_writable();

  return;
}
# Local shorthand: forwards all arguments to Mail::SpamAssassin::dbg().
sub dbg {
  return Mail::SpamAssassin::dbg (@_);
}
# Local shorthand: forwards all arguments to Mail::SpamAssassin::sa_die().
sub sa_die {
  return Mail::SpamAssassin::sa_die (@_);
}
# Combine token probabilities with Robinson's geometric-mean method.
#
# Arguments: the list of token probabilities.
# Returns the combined probability (0.5 means undecided), or nothing when
# the list is empty.
sub robinson_naive_bayes_probs_combine {
  my @probs = @_;

  my $n = scalar @probs;
  return unless $n;

  # $P accumulates the product of (1 - p), $Q the product of p
  my ($P, $Q) = (1, 1);
  for my $p (@probs) {
    $P *= (1-$p);
    $Q *= $p;
  }

  # one minus the n-th root of each product
  $P = 1 - ($P ** (1 / $n));
  $Q = 1 - ($Q ** (1 / $n));

  # map the indicator (P-Q)/(P+Q) from the range -1..1 onto 0..1
  return (1 + ($P - $Q) / ($P + $Q)) / 2.0;
}
# Evaluate the series exp(-m) * sum_{i=0}^{v/2-1} m^i / i!  with m = x2/2,
# capped at 1.0.
#
# Arguments:
#   $x2 - the chi-squared value
#   $v  - the degrees of freedom; must be even
#
# Dies if $v is odd.
sub chi2q {
  my ($x2, $v) = @_;

  die "v must be even in chi2q(x2, v)" if $v & 1;

  my $half_x2 = $x2 / 2.0;
  my $term = exp(0 - $half_x2);
  my $total = $term;

  # add the remaining (v/2 - 1) terms, each derived from its predecessor
  my $last_index = ($v/2)-1;
  my $i = 1;
  while ($i <= $last_index) {
    $term *= $half_x2 / $i;
    $total += $term;
    $i++;
  }

  return 1.0 unless $total < 1.0;
  return $total;
}
# Combine token probabilities with the chi-squared method.
#
# Arguments:
#   $ns, $nn - number of spam and ham messages learnt
#   @sorted  - the token probabilities
#
# Returns the combined probability, or nothing when there are no
# probabilities or no messages have been learnt.
sub chi_squared_probs_combine {
  my ($ns, $nn, @sorted) = @_;

  my $ntokens = scalar @sorted;
  return unless $ntokens;

  my $nmsgs = ($ns + $nn);
  return if ($nmsgs == 0);

  # both running products start out as the share of the respective class
  # in the corpus
  my $S = ($ns / $nmsgs);
  my $H = ($nn / $nmsgs);

  # binary exponents split off the products to avoid floating-point underflow
  my ($Sexp, $Hexp) = (0, 0);

  use POSIX qw(frexp);

  for my $p (@sorted) {
    $S *= 1.0 - $p;
    $H *= $p;

    if ($S < 1e-200) {
      my ($mantissa, $exponent) = frexp($S);
      $S = $mantissa;
      $Sexp += $exponent;
    }
    if ($H < 1e-200) {
      my ($mantissa, $exponent) = frexp($H);
      $H = $mantissa;
      $Hexp += $exponent;
    }
  }

  use constant LN2 => log(2);

  # natural logs of the full products: mantissa part plus exponent part
  $S = log($S) + $Sexp * LN2;
  $H = log($H) + $Hexp * LN2;

  $S = 1.0 - chi2q(-2.0 * $S, 2 * $ntokens);
  $H = 1.0 - chi2q(-2.0 * $H, 2 * $ntokens);

  return (($S - $H) + 1.0) / 2.0;
}
# Print the contents of the Bayes database to STDOUT.
#
# Arguments:
#   $magic - true to print the non-token ("magic") values
#   $toks  - true to have the store print the tokens
#   $regex - passed through to the store's dump_db_toks()
#
# Returns 0 if the database cannot be tied read-only, 1 otherwise.  The
# database is untied afterwards unless the caller has announced that it
# will untie itself.
sub dump_bayes_db {
  my ($self, $magic, $toks, $regex) = @_;

  my $store = $self->{store};
  return 0 unless $store->tie_db_readonly();

  my @vars = $store->get_storage_variables();
  my ($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;

  my $template = '%3.3f %10u %10u %10u %s'."\n";

  if ( $magic ) {
    # [ value, label, print-this-row? ] in output order; some rows exist
    # only for database versions >= 2, one only for older versions
    my @rows = (
      [ $bv, 'non-token data: bayes db version', 1 ],
      [ $ns, 'non-token data: nspam', 1 ],
      [ $nh, 'non-token data: nham', 1 ],
      [ $nt, 'non-token data: ntokens', 1 ],
      [ $oa, 'non-token data: oldest atime', 1 ],
      [ $na, 'non-token data: newest atime', ( $bv >= 2 ) ],
      [ $sb, 'non-token data: current scan-count', ( $bv < 2 ) ],
      [ $js, 'non-token data: last journal sync atime', ( $bv >= 2 ) ],
      [ $le, 'non-token data: last expiry atime', 1 ],
      [ $ad, 'non-token data: last expire atime delta', ( $bv >= 2 ) ],
      [ $er, 'non-token data: last expire reduction count', ( $bv >= 2 ) ],
    );

    foreach my $row (@rows) {
      my ($value, $label, $wanted) = @{$row};
      next unless $wanted;
      printf ($template, 0.0, 0, $value, 0, $label);
    }
  }

  if ( $toks ) {
    $store->dump_db_toks($template, $regex, @vars);
  }

  unless ($self->{main}->{learn_caller_will_untie}) {
    $store->untie_db();
  }

  return 1;
}
1;