AutoLearnThreshold.pm [plain text]
=head1 NAME
Mail::SpamAssassin::Plugin::AutoLearnThreshold - threshold-based discriminator for Bayes auto-learning
=head1 SYNOPSIS
loadplugin Mail::SpamAssassin::Plugin::AutoLearnThreshold
=head1 DESCRIPTION
This plugin implements the threshold-based auto-learning discriminator
for SpamAssassin's Bayes subsystem. Auto-learning is a mechanism
whereby high-scoring mails (or low-scoring mails, for non-spam) are fed
into its learning systems without user intervention, during scanning.
Note that certain tests are ignored when determining whether a message
should be trained upon:
=over 4
=item * rules with tflags set to 'learn' (the Bayesian rules)
=item * rules with tflags set to 'userconf' (user configuration)
=item * rules with tflags set to 'noautolearn'
=back
Also note that auto-learning occurs using scores from either scoreset 0
or 1, depending on what scoreset is used during message check. It is
likely that the message check and auto-learn scores will be different.
=cut
package Mail::SpamAssassin::Plugin::AutoLearnThreshold;
use Mail::SpamAssassin::Plugin;
use Mail::SpamAssassin::Logger;
use strict;
use warnings;
use bytes;
use vars qw(@ISA);
@ISA = qw(Mail::SpamAssassin::Plugin);
sub new {
my $class = shift;
my $mailsaobject = shift;
$class = ref($class) || $class;
my $self = $class->SUPER::new($mailsaobject);
bless ($self, $class);
$self->set_config($mailsaobject->{conf});
return $self;
}
sub set_config {
my($self, $conf) = @_;
my @cmds = ();
=head1 USER OPTIONS
The following configuration settings are used to control auto-learning:
=over 4
=item bayes_auto_learn_threshold_nonspam n.nn (default: 0.1)
The score threshold below which a mail has to score, to be fed into
SpamAssassin's learning systems automatically as a non-spam message.
=cut
push (@cmds, {
setting => 'bayes_auto_learn_threshold_nonspam',
default => 0.1,
type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
});
=item bayes_auto_learn_threshold_spam n.nn (default: 12.0)
The score threshold above which a mail has to score, to be fed into
SpamAssassin's learning systems automatically as a spam message.
Note: SpamAssassin requires at least 3 points from the header, and 3
points from the body to auto-learn as spam. Therefore, the minimum
working value for this option is 6.
=cut
push (@cmds, {
setting => 'bayes_auto_learn_threshold_spam',
default => 12.0,
type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
});
$conf->{parser}->register_commands(\@cmds);
}
sub autolearn_discriminator {
my ($self, $params) = @_;
my $scan = $params->{permsgstatus};
my $conf = $scan->{conf};
my $min = $conf->{bayes_auto_learn_threshold_nonspam};
my $max = $conf->{bayes_auto_learn_threshold_spam};
my $score = $scan->get_autolearn_points();
my $body_only_points = $scan->get_body_only_points();
my $head_only_points = $scan->get_head_only_points();
my $learned_points = $scan->get_learned_points();
dbg("learn: auto-learn? ham=$min, spam=$max, ".
"body-points=".$body_only_points.", ".
"head-points=".$head_only_points.", ".
"learned-points=".$learned_points);
my $isspam;
if ($score < $min) {
$isspam = 0;
} elsif ($score >= $max) {
$isspam = 1;
} else {
dbg("learn: auto-learn? no: inside auto-learn thresholds, not considered ham or spam");
return;
}
my $learner_said_ham_points = -1.0;
my $learner_said_spam_points = 1.0;
if ($isspam) {
my $required_body_points = 3;
my $required_head_points = 3;
if ($body_only_points < $required_body_points) {
dbg("learn: auto-learn? no: scored as spam but too few body points (".
$body_only_points." < ".$required_body_points.")");
return;
}
if ($head_only_points < $required_head_points) {
dbg("learn: auto-learn? no: scored as spam but too few head points (".
$head_only_points." < ".$required_head_points.")");
return;
}
if ($learned_points < $learner_said_ham_points) {
dbg("learn: auto-learn? no: scored as spam but learner indicated ham (".
$learned_points." < ".$learner_said_ham_points.")");
return;
}
if (!$scan->is_spam()) {
dbg("learn: auto-learn? no: scored as ham but autolearn wanted spam");
return;
}
} else {
if ($learned_points > $learner_said_spam_points) {
dbg("learn: auto-learn? no: scored as ham but learner indicated spam (".
$learned_points." > ".$learner_said_spam_points.")");
return;
}
if ($scan->is_spam()) {
dbg("learn: auto-learn? no: scored as spam but autolearn wanted ham");
return;
}
}
dbg("learn: auto-learn? yes, ".($isspam?"spam ($score > $max)":"ham ($score < $min)"));
return $isspam;
}
1;
=back
=cut