# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

=head1 NAME

Mail::SpamAssassin::BayesStore - Bayesian Storage Module

=head1 DESCRIPTION

This is the public API for the Bayesian store methods.  Any implementation of
the storage module must implement these methods.

=cut

package Mail::SpamAssassin::BayesStore;

use strict;
use warnings;
use bytes;

use Mail::SpamAssassin::Logger;

# TODO: if we ever get tuits, it'd be good to make these POD
# method docs more perlish... hardly a biggie.

=head1 METHODS

=over 4

=item new

public class (Mail::SpamAssassin::BayesStore) new (Mail::SpamAssassin::Bayes $bayes)

Description:
This method creates a new instance of the Mail::SpamAssassin::BayesStore
object.  You must pass in an instance of the Mail::SpamAssassin::Bayes
object, which is stashed for use throughout the module.

The new object starts with a supported database version of 0 and an
undefined current database version.

=cut

sub new {
  my ($class, $bayes) = @_;

  # Accept both Class->new(...) and $instance->new(...).
  my $package = ref($class) || $class;

  my %fields = (
    'bayes'                => $bayes,
    'supported_db_version' => 0,
    'db_version'           => undef,
  );

  return bless \%fields, $package;
}

=item DB_VERSION

public instance (Integer) DB_VERSION ()

Description:
This method returns the currently supported database version for the
implementation.

=cut

sub DB_VERSION {
  my $self = shift;
  return $self->{supported_db_version};
}

=item read_db_configs

public instance () read_db_configs ()

Description:
This method copies the expiry related settings from the configuration
object onto this object and then calls the Mail::SpamAssassin::Bayes
read_db_configs method.

=cut

sub read_db_configs {
  my $self = shift;

  # TODO: at some stage, this may be useful to read config items which
  # control database bloat, like
  #
  # - use of hapaxes
  # - use of case-sensitivity
  # - more midrange-hapax-avoidance tactics when parsing headers (future)
  #
  # for now, we just set these settings statically.
  my $conf = $self->{bayes}->{main}->{conf};

  # Each expiry setting lives in the config under the same name with a
  # "bayes_" prefix.  About expiry_max_db_size: minimum desired database
  # size?  Expiry will not shrink the database below this number of
  # entries.  100k entries is roughly equivalent to a 5Mb database file.
  foreach my $setting (qw(expiry_max_db_size expiry_pct
                          expiry_period expiry_max_exponent))
  {
    $self->{$setting} = $conf->{'bayes_'.$setting};
  }

  return $self->{bayes}->read_db_configs();
}

=item tie_db_readonly

public instance (Boolean) tie_db_readonly ()

Description:
This method opens up the database in readonly mode.

=cut

sub tie_db_readonly {
  my $self = shift;
  die "bayes: tie_db_readonly: not implemented\n";
}

=item tie_db_writable

public instance (Boolean) tie_db_writable ()

Description:
This method opens up the database in writable mode.

Any callers of this method should ensure that they call untie_db()
afterwards.

=cut

sub tie_db_writable {
  my $self = shift;
  die "bayes: tie_db_writable: not implemented\n";
}

=item untie_db

public instance () untie_db ()

Description:
This method unties the database.

=cut

sub untie_db {
  my $self = shift;
  die "bayes: untie_db: not implemented\n";
}

=item calculate_expire_delta

public instance (%) calculate_expire_delta (Integer $newest_atime,
                                             Integer $start,
                                             Integer $max_expire_mult)

Description:
This method performs a calculation on the data to determine the
optimum atime for token expiration.

The caller (C<expire_old_tokens_trapped>) looks the returned hash up by
power-of-two multiplier (1, 2, 4, ... up to C<$max_expire_mult>) and treats
each value as the token reduction obtained with an atime delta of
C<$start> times that multiplier.  An empty hash aborts the expire.

=cut

sub calculate_expire_delta {
  my ($self, $newest_atime, $start, $max_expire_mult) = @_;
  die "bayes: calculate_expire_delta: not implemented\n";
}

=item token_expiration

public instance (Integer, Integer,
                 Integer, Integer) token_expiration (\% $opts,
                                                     Integer $newdelta,
                                                     @ @vars)

Description:
This method performs the database specific expiration of tokens based on
the passed in C<$newdelta> and C<@vars>.  C<@vars> is the list obtained
from C<get_storage_variables> by the caller (element 10 is the newest
token atime).

The caller uses the four return values, in order, as: the number of tokens
kept, the number of tokens deleted, the number of 1-occurrence tokens and
the number of tokens with less than 8 occurrences.

=cut

sub token_expiration {
  # Argument order matches the call made by expire_old_tokens_trapped().
  my ($self, $opts, $newdelta, @vars) = @_;
  die "bayes: token_expiration: not implemented\n";
}

=item expire_old_tokens

public instance (Boolean) expire_old_tokens (\% hashref)

Description:
This method expires old tokens from the database.  The database is tied
writable for the duration of the expire; a failure inside the expire is
reported with a warning and results in a false return value.

=cut

sub expire_old_tokens {
  my ($self, $opts) = @_;
  my $ret;

  eval {
    local $SIG{'__DIE__'};   # do not run user die() traps in here

    if ($self->tie_db_writable()) {
      $ret = $self->expire_old_tokens_trapped ($opts);
    }
  };
  my $err = $@;              # save it, the untie below may clobber $@

  # Untie whether or not the expire succeeded, unless the caller has
  # promised to do it.
  if (!$self->{bayes}->{main}->{learn_caller_will_untie}) {
    $self->untie_db();
  }

  if ($err) {                # the expire died: report it, signal failure
    warn "bayes: expire_old_tokens: $err\n";
    return 0;
  }

  $ret;
}

=item expire_old_tokens_trapped

public instance (Boolean) expire_old_tokens_trapped (\% $opts)

Description:
This method does the actual token expiration.

If C<$opts-E<gt>{verbose}> is true a summary is printed to standard
output, otherwise it is sent to the debug log.

XXX More docs here about the methodology and what not

=cut

sub expire_old_tokens_trapped {
  my ($self, $opts) = @_;

  # Flag that we're doing work
  $self->set_running_expire_tok();

  # We don't need to do an expire, so why were we called?  Oh well.
  if (!$self->expiry_due()) {
    $self->remove_running_expire_tok();
    return 0;
  }

  my $started = time();
  my @vars = $self->get_storage_variables();

  if ( $vars[10] > time ) {
    dbg("bayes: expiry found newest atime in the future, resetting to current time");
    $vars[10] = time;
  }

  # How many tokens do we want to keep?
  my $goal_reduction = int($self->{expiry_max_db_size} * $self->{expiry_pct});
  dbg("bayes: expiry check keep size, ".$self->{expiry_pct}." * max: $goal_reduction");
  # Make sure we keep at least 100000 tokens in the DB
  if ( $goal_reduction < 100000 ) {
    $goal_reduction = 100000;
    dbg("bayes: expiry keep size too small, resetting to 100,000 tokens");
  }
  # Now turn goal_reduction into how many to expire.
  $goal_reduction = $vars[3] - $goal_reduction;
  dbg("bayes: token count: ".$vars[3].", final goal reduction size: $goal_reduction");

  if ( $goal_reduction < 1000 ) { # too few tokens to expire, abort.
    dbg("bayes: reduction goal of $goal_reduction is under 1,000 tokens, skipping expire");
    $self->set_last_expire(time());
    $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
    return 1; # we want to indicate things ran as expected
  }

  # Estimate new atime delta based on the last atime delta
  my $newdelta = 0;
  if ( $vars[9] > 0 ) {
    # newdelta = olddelta * old / goal;
    # this may seem backwards, but since we're talking delta here,
    # not actual atime, we want smaller atimes to expire more tokens,
    # and vice versa.
    #
    $newdelta = int($vars[8] * $vars[9] / $goal_reduction);
  }

  # Calculate size difference between last expiration token removal
  # count and the current goal removal count.
  # ($goal_reduction is at least 1000 here, so the first division is safe;
  # the second is only reached when $vars[9] is non-zero.)
  my $ratio = ($vars[9] == 0 || $vars[9] > $goal_reduction)
                ? $vars[9]/$goal_reduction
                : $goal_reduction/$vars[9];

  dbg("bayes: first pass? current: ".time().", Last: ".$vars[4].", atime: ".$vars[8].", count: ".$vars[9].", newdelta: $newdelta, ratio: $ratio, period: ".$self->{expiry_period});

  ## ESTIMATION PHASE
  #
  # Do this for the first expire or "odd" looking results cause a first pass to determine atime:
  #
  # - last expire was more than 30 days ago
  #   assume mail flow stays roughly the same month to month, recompute if it's > 1 month
  # - last atime delta was under expiry period
  #   if we're expiring often max_db_size should go up, but let's recompute just to check
  # - last reduction count was < 1000 tokens
  #   ditto
  # - new estimated atime delta is under expiry period
  #   ditto
  # - difference of last reduction to current goal reduction is > 50%
  #   if the two values are out of balance, estimating atime is going to be funky, recompute
  #
  if ( (time() - $vars[4] > 86400*30)
       || ($vars[8] < $self->{expiry_period})
       || ($vars[9] < 1000)
       || ($newdelta < $self->{expiry_period})
       || ($ratio > 1.5) )
  {
    dbg("bayes: can't use estimation method for expiry, unexpected result, calculating optimal atime delta (first pass)");

    my $start = $self->{expiry_period}; # exponential search starting at ...?  1/2 day, 1, 2, 4, 8, 16, ...
    my $max_expire_mult = 2**$self->{expiry_max_exponent}; # $max_expire_mult * $start = max expire time (256 days), power of 2.

    dbg("bayes: expiry max exponent: ".$self->{expiry_max_exponent});

    my %delta = $self->calculate_expire_delta($vars[10], $start, $max_expire_mult);

    # NOTE(review): unlike the other early exits, this one neither records
    # a last-expire time nor removes the running-expire token -- confirm
    # whether that is intentional before changing it.
    return 0 unless (%delta);

    # This will skip the for loop if debugging isn't enabled ...
    if (would_log('dbg', 'bayes')) {
      dbg("bayes: atime\ttoken reduction");
      dbg("bayes: ========\t===============");
      for(my $i = 1; $i<=$max_expire_mult; $i <<= 1) {
        dbg("bayes: ".$start*$i."\t".(exists $delta{$i} ? $delta{$i} : 0));
      }
    }

    # Now figure out which max_expire_mult value gives the closest results to goal_reduction, without
    # going over ...  Go from the largest delta backwards so the reduction size increases
    # (tokens that expire at 4 also expire at 3, 2, and 1, so 1 will always be the largest expiry...)
    #
    for( ; $max_expire_mult > 0; $max_expire_mult>>=1 ) {
      next unless exists $delta{$max_expire_mult};
      if ($delta{$max_expire_mult} > $goal_reduction) {
        $max_expire_mult<<=1; # the max expire is actually the next power of 2 out
        last;
      }
    }

    # if max_expire_mult gets to 0, either we can't expire anything, or 1 is <= $goal_reduction
    $max_expire_mult ||= 1;

    # $max_expire_mult is now equal to the value we should use ...
    # Check to see if the atime value we found is really good.
    # It's not good if:
    # - $max_expire_mult would not expire any tokens.  This means that the majority of
    #   tokens are old or new, and more activity is required before an expiry can occur.
    # - reduction count < 1000, not enough tokens to be worth doing an expire.
    #
    if ( !exists $delta{$max_expire_mult} || $delta{$max_expire_mult} < 1000 ) {
      dbg("bayes: couldn't find a good delta atime, need more token difference, skipping expire");
      $self->set_last_expire(time());
      $self->remove_running_expire_tok(); # this won't be cleaned up, so do it now.
      return 1; # we want to indicate things ran as expected
    }

    $newdelta = $start * $max_expire_mult;
    dbg("bayes: first pass decided on $newdelta for atime delta");
  }
  else { # use the estimation method
    dbg("bayes: can do estimation method for expiry, skipping first pass");
  }

  my ($kept, $deleted, $num_hapaxes, $num_lowfreq) = $self->token_expiration($opts, $newdelta, @vars);

  my $done = time();

  my $msg = "expired old bayes database entries in ".($done - $started)." seconds";
  my $msg2 = "$kept entries kept, $deleted deleted";

  if ($opts->{verbose}) {
    # Percentages are relative to the kept tokens; report 0% rather than
    # dying on a division by zero if the store kept nothing.
    my $hapax_pc   = $kept ? ($num_hapaxes * 100) / $kept : 0;
    my $lowfreq_pc = $kept ? ($num_lowfreq * 100) / $kept : 0;
    print "$msg\n$msg2\n";
    printf "token frequency: 1-occurrence tokens: %3.2f%%\n", $hapax_pc;
    printf "token frequency: less than 8 occurrences: %3.2f%%\n", $lowfreq_pc;
  }
  else {
    dbg("bayes: $msg: $msg2");
  }

  return 1;
}

=item sync_due

public instance (Boolean) sync_due ()

Description:
This method determines if a sync is due.

=cut

sub sync_due {
  my ($self) = @_;
  die "bayes: sync_due: not implemented\n";
}

=item expiry_due

public instance (Boolean) expiry_due ()

Description:
This method determines if an expire is due.

=cut

sub expiry_due {
  my ($self) = @_;

  $self->read_db_configs();   # make sure this has happened here

  # If force expire was called, do the expire no matter what.
  return 1 if ($self->{bayes}->{main}->{learn_force_expire});

  # if config says not to auto expire then no need to continue
  return 0 if ($self->{bayes}->{main}->{conf}->{bayes_auto_expire} == 0);

  # is the database too small for expiry?  (Do *not* use "scalar keys",
  # as this will iterate through the entire db counting them!)
  my @vars = $self->get_storage_variables();
  my $ntoks = $vars[3];

  my $last_expire = time() - $vars[4];   # seconds since the last expire
  if (!$self->{bayes}->{main}->{ignore_safety_expire_timeout}) {
    # if we're not ignoring the safety timeout, don't run an expire more
    # than once every 12 hours.
    return 0 if ($last_expire < 43200);
  }
  else {
    # if we are ignoring the safety timeout (e.g.: mass-check), still
    # limit the expiry to only one every 5 minutes.
    return 0 if ($last_expire < 300);
  }

  dbg("bayes: DB expiry: tokens in DB: $ntoks, Expiry max size: ".$self->{expiry_max_db_size}.", Oldest atime: ".$vars[5].", Newest atime: ".$vars[10].", Last expire: ".$vars[4].", Current time: ".time());

  if ($ntoks <= 100000 ||                          # keep at least 100k tokens
      $self->{expiry_max_db_size} > $ntoks ||      # not enough tokens to cause an expire
      $vars[10]-$vars[5] < 43200 ||                # delta between oldest and newest < 12h
      $self->{db_version} < $self->DB_VERSION      # ignore old db formats
     )
  {
    return 0;
  }

  return 1;
}

=item seen_get

public instance (Char) seen_get (String $msgid)

Description:
This method retrieves the stored value, if any, for C<$msgid>.  The return
value is the stored string ('s' for spam and 'h' for ham) or undef if
C<$msgid> is not found.

=cut

sub seen_get {
  my ($self, $msgid) = @_;
  die "bayes: seen_get: not implemented\n";
}

=item seen_put

public instance (Boolean) seen_put (String $msgid, Char $flag)

Description:
This method records C<$msgid> as the type given by C<$flag>.  C<$flag> is
one of two values 's' for spam and 'h' for ham.

=cut

sub seen_put {
  my ($self, $msgid, $flag) = @_;
  die "bayes: seen_put: not implemented\n";
}

=item seen_delete

public instance (Boolean) seen_delete (String $msgid)

Description:
This method removes C<$msgid> from storage.

=cut

sub seen_delete {
  my ($self, $msgid) = @_;
  die "bayes: seen_delete: not implemented\n";
}

=item get_storage_variables

public instance (@) get_storage_variables ()

Description:
This method retrieves the various administrative variables used by
the Bayes storage implementation.

The values returned in the array are in the following order:

0: scan count base

1: number of spam

2: number of ham

3: number of tokens in db

4: last expire atime

5: oldest token in db atime

6: db version value

7: last journal sync

8: last atime delta

9: last expire reduction count

10: newest token in db atime

=cut

sub get_storage_variables {
  my ($self) = @_;
  die "bayes: get_storage_variables: not implemented\n";
}

=item dump_db_toks

public instance () dump_db_toks (String $template, String $regex, @ @vars)

Description:
This method loops over all tokens, computing the probability for the token
and then printing it out according to the passed in template.

=cut

sub dump_db_toks {
  my ($self, $template, $regex, @vars) = @_;
  die "bayes: dump_db_toks: not implemented\n";
}

=item set_last_expire

public instance (Boolean) set_last_expire (Integer $time)

Description:
This method sets the last expire time.

=cut

sub set_last_expire {
  my ($self, $time) = @_;
  die "bayes: set_last_expire: not implemented\n";
}

=item get_running_expire_tok

public instance (Time) get_running_expire_tok ()

Description:
This method determines if an expire is currently running and returns the
time the expire started.

=cut

sub get_running_expire_tok {
  my ($self) = @_;
  die "bayes: get_running_expire_tok: not implemented\n";
}

=item set_running_expire_tok

public instance (Time) set_running_expire_tok ()

Description:
This method sets the running expire time to the current time.

=cut

sub set_running_expire_tok {
  my ($self) = @_;
  die "bayes: set_running_expire_tok: not implemented\n";
}

=item remove_running_expire_tok

public instance (Boolean) remove_running_expire_tok ()

Description:
This method removes a currently set running expire time.

=cut

sub remove_running_expire_tok {
  my $self = shift;
  die "bayes: remove_running_expire_tok: not implemented\n";
}

=item tok_get

public instance (Integer, Integer, Time) tok_get (String $token)

Description:
This method retrieves the specified token (C<$token>) from storage and
returns its spam count, ham count and last access time.

=cut

sub tok_get {
  my $self = shift;
  my ($token) = @_;
  die "bayes: tok_get: not implemented\n";
}

=item tok_get_all

public instance (\@) tok_get_all (@ @tokens)

Description:
This method retrieves the specified tokens (C<@tokens>) from storage and
returns an array ref of arrays holding the spam count, ham count and last
access time.

=cut

sub tok_get_all {
  my $self = shift;
  # NOTE(review): the POD above advertises a list of tokens while this stub
  # unpacks a single scalar -- confirm against the concrete implementations.
  my ($tokens) = @_;
  die "bayes: tok_get_all: not implemented\n";
}

=item tok_count_change

public instance (Boolean) tok_count_change (Integer $spam_count,
                                            Integer $ham_count,
                                            String $token,
                                            Time $atime)

Description:
This method takes a C<$spam_count> and C<$ham_count> and adds it to
C<$token>, along with updating the atime of C<$token> with C<$atime>.

=cut

sub tok_count_change {
  my $self = shift;
  my ($spam_count, $ham_count, $token, $atime) = @_;
  die "bayes: tok_count_change: not implemented\n";
}

=item multi_tok_count_change

public instance (Boolean) multi_tok_count_change (Integer $spam_count,
                                                  Integer $ham_count,
                                                  \% $tokens,
                                                  String $atime)

Description:
This method takes a C<$spam_count> and C<$ham_count> and adds it to all
of the tokens in the C<$tokens> hash ref, along with updating the atime
of each token with C<$atime>.

=cut

sub multi_tok_count_change {
  my $self = shift;
  my ($spam_count, $ham_count, $tokens, $atime) = @_;
  die "bayes: multi_tok_count_change: not implemented\n";
}

=item nspam_nham_get

public instance (Integer, Integer) nspam_nham_get ()

Description:
This method retrieves the total number of spam and the total number of
ham currently under storage.

=cut

sub nspam_nham_get {
  my ($self) = @_;
  die "bayes: nspam_nham_get: not implemented\n";
}

=item nspam_nham_change

public instance (Boolean) nspam_nham_change (Integer $num_spam,
                                             Integer $num_ham)

Description:
This method updates the number of spam and the number of ham in the database.

=cut

sub nspam_nham_change {
  my ($self, $num_spam, $num_ham) = @_;
  die "bayes: nspam_nham_change: not implemented\n";
}

=item tok_touch

public instance (Boolean) tok_touch (String $token,
                                     Time $atime)

Description:
This method updates the access time of the given token (C<$token>).

=cut

sub tok_touch {
  my ($self, $token, $atime) = @_;
  # Message spelling kept in line with every other unimplemented stub.
  die "bayes: tok_touch: not implemented\n";
}

=item tok_touch_all

public instance (Boolean) tok_touch_all (\@ $tokens,
                                         Time $atime)

Description:
This method does a mass update of the given list of tokens C<$tokens>, if the existing token
atime is < C<$atime>.

=cut

sub tok_touch_all {
  my ($self, $tokens, $atime) = @_;
  # Message spelling kept in line with every other unimplemented stub.
  die "bayes: tok_touch_all: not implemented\n";
}

=item cleanup

public instance (Boolean) cleanup ()

Description:
This method performs any cleanup necessary before moving onto the next
operation.

=cut

sub cleanup {
  my ($self) = @_;
  die "bayes: cleanup: not implemented\n";
}

=item get_magic_re

public instance get_magic_re (String)

Description:
This method returns a regexp which indicates a magic token.

=cut

sub get_magic_re {
  my ($self) = @_;
  die "bayes: get_magic_re: not implemented\n";
}

=item sync

public instance (Boolean) sync (\% $opts)

Description:
This method performs a sync of the database.

=cut

sub sync {
  my ($self, $opts) = @_;
  die "bayes: sync: not implemented\n";
}

=item perform_upgrade

public instance (Boolean) perform_upgrade (\% $opts)

Description:
This method is a utility method that performs any necessary upgrades
between versions.  It should know how to handle previous versions and
what needs to happen to upgrade them.

A true return value indicates success.

=cut

sub perform_upgrade {
  my $self = shift;
  my ($opts) = @_;
  die "bayes: perform_upgrade: not implemented\n";
}

=item clear_database

public instance (Boolean) clear_database ()

Description:
This method deletes all records for a particular user.

Callers should be aware that any errors returned by this method
could cause the database to be inconsistent for the given user.

=cut

sub clear_database {
  my $self = shift;
  die "bayes: clear_database: not implemented\n";
}

=item backup_database

public instance (Boolean) backup_database ()

Description:
This method will dump the user's database in a machine readable format.

=cut

sub backup_database {
  my $self = shift;
  die "bayes: backup_database: not implemented\n";
}

=item restore_database

public instance (Boolean) restore_database (String $filename, Boolean $showdots)

Description:
This method restores a database from the given filename, C<$filename>.

Callers should be aware that any errors returned by this method
could cause the database to be inconsistent for the given user.

=cut

sub restore_database {
  my $self = shift;
  my ($filename, $showdots) = @_;
  die "bayes: restore_database: not implemented\n";
}

=item db_readable

public instance (Boolean) db_readable ()

Description:
This method returns whether or not the Bayes DB is available in a
readable state.

=cut

sub db_readable {
  my $self = shift;
  die "bayes: db_readable: not implemented\n";
}

=item db_writable

public instance (Boolean) db_writable ()

Description:
This method returns whether or not the Bayes DB is available in a
writable state.

=cut

sub db_writable {
  my $self = shift;
  die "bayes: db_writable: not implemented\n";
}

# Thin pass-through to the main module's sa_die, with the arguments untouched.
sub sa_die {
  Mail::SpamAssassin::sa_die(@_);
}

1;

=back

=cut