package Mail::SpamTest::Bayesian; =head1 NAME Mail::SpamTest::Bayesian - Perl extension for Bayesian spam-testing =head1 SYNOPSIS use Mail::SpamTest::Bayesian; my $j=Mail::SpamTest::Bayesian->new(dir => '.'); $j->init_db; $j->merge_mbox_spam($scalar_spam_box); $j->merge_mbox_nonspam($scalar_nonspam_box); $message=$j->markup_message($message); =head1 DESCRIPTION This module implements the Bayesian spam-testing algorithm described by Paul Graham at: http://www.paulgraham.com/spam.html In short: the system is trained by exposure to mailboxes of known spam and non-spam messages. These are (1) MIME-decoded, and non-text parts deleted; (2) tokenised. The database files spam.db and nonspam.db contain lists of tokens and the number of messages in which they have occurred; general.db holds a message count. This module is in early development; it is functional but basic. It is expected that more mailbox parsing routines will be added, probably using Mail::Box; and that ancillary programs will be supplied for use of the module as a personal mail filter. =head1 METHODS =cut use 5.006; use strict; use warnings; require Exporter; our @ISA = qw(Exporter); our $VERSION = '0.01'; use strict; use BerkeleyDB; # libberkeleydb-perl use MIME::Parser; # libmime-perl =head2 new() Standard constructor. Pass a hash or hashref with parameters. Useful parameters: dir -> database directory (.) significant -> number of significant tokens to consider (15) threshold -> spam threshold (0.9) fudgefactor -> Non-spam priority (2) =cut sub new { my $proto = shift; my $class = ref($proto) || $proto; my $self={}; bless ($self, $class); $self->{dir}='.'; $self->{significant}=15; $self->{threshold}=0.9; $self->{fudgefactor}=2; my @param; while (my $p=shift) { if (ref($p) eq 'HASH') { map {$self->{lc($_)}=$p->{$_}} keys %{$p}; } else { my $v=shift; $self->{$p}=$v; } } foreach my $db (qw(spam nonspam general)) { $self->{$db}=new BerkeleyDB::Hash( -Filename => "$self->{dir}/$db.db", -Flags => DB_CREATE ); } $self->{parser}=new MIME::Parser; $self->{parser}->output_to_core(1); $self->{parser}->tmp_to_core(1); $self->{parser}->tmp_recycling(1); return $self; } =head2 init_db() Resets databases. Note that this will not recover space - if you want to delete an existing database, just delete the three files general.db, spam.db and nonspam.db. Call this only once, when you first set up the database. =cut sub init_db { my $self=shift; foreach my $db (qw(spam nonspam general)) { my ($k,$v); my $cursor=$self->{$db}->db_cursor; while ($cursor->c_get($k, $v, DB_NEXT) == 0) { $cursor->c_del; } } $self->{general}->db_put('spam',0); $self->{general}->db_put('nonspam',0); } =head2 merge_mbox_spam() Train the system by giving it a mailbox full of spam. Pass a scalar or array or arrayref containing raw messages. =cut sub merge_mbox_spam { my $self=shift; $self->merge_mbox(1,@_); } =head2 merge_mbox_nonspam() Train the system by giving it a mailbox full of legitimate email. Pass a scalar or array or arrayref containing raw messages. =cut sub merge_mbox_nonspam { my $self=shift; $self->merge_mbox(0,@_); } sub merge_mbox { my $self=shift; my $spamstate=shift; my @message=@_; if (scalar @message == 1) { my $m=$message[0]; if (ref($m) eq 'ARRAY') { @message=@{$m}; $m=''; } elsif (ref($m) eq 'SCALAR') { $m=$$m; } if ($m ne '') { @message=map {"From $_"} grep !/^$/, (split /^From /m,$m); } } foreach my $m (@message) { $self->merge_message($spamstate,$m); } } =head2 merge_message_spam() As merge_mbox_spam, but for a single message; pass in a scalar. =cut sub merge_message_spam { my $self=shift; $self->merge_message(1,@_); } =head2 merge_message_nonspam() As merge_mbox_nonspam, but for a single message; pass in a scalar. =cut sub merge_message_nonspam { my $self=shift; $self->merge_message(0,@_); } sub merge_message { my $self=shift; my $spamstate=shift; my $message=shift; my @tokens=$self->_tokenise_message($message); @tokens=keys %{{ map {$_ => 1} @tokens }}; my $sk=($spamstate==1)?'spam':'nonspam'; foreach my $t (@tokens) { my $old; if ($self->{$sk}->db_get($t,$old) == 0) { $old++; } else { $old=1; } $self->{$sk}->db_put($t,$old); delete $self->{tokencache}->{$t}; } my $old; $self->{general}->db_get($sk,$old); $old++; $self->{general}->db_put($sk,$old); } =head2 markup_message() Test a message for possible spammishness. Pass a scalar containing a single message. Will return the original message with inserted headers: X-Bayesian-Spam: (YES|NO) (probability%) X-Bayesian-Test: the significant tests and their weights =cut sub markup_message { my $self=shift; my $message=shift; my ($spam,$prob,$list)=$self->test_message($message); my $text=($spam)?'YES':'NO'; $prob=sprintf("%.1f",100*$prob); $message =~ s/^$/X-Bayesian-Spam: $text ($prob%)\n/m; $text=join(', ',@{$list}); $message =~ s/^$/X-Bayesian-Test: $text\n/m; return $message; } =head2 test_message() Pass a scalar containing a single message. Returns a list: 0: spam status (1 for spam, 0 for non spam) 1: probability of spam 2: listref of significant tests =cut sub test_message { my $self=shift; my $message=shift; my @tokens=$self->_tokenise_message($message); my %total; foreach my $mode (qw(spam nonspam)) { if ($self->{general}->db_get($mode,$total{$mode})) { $total{$mode}=0; } } foreach my $token (@tokens) { unless (exists $self->{tokencache}->{$token}) { $self->{tokencache}->{$token}=0.2; my %this; foreach my $mode (qw(spam nonspam)) { if ($self->{$mode}->db_get($token,$this{$mode})) { $this{$mode}=0; } } $this{nonspam}*=$self->{fudgefactor}; if ($this{spam}+$this{nonspam}>5) { $self->{tokencache}->{$token}= &_max(0.01,&_min(0.99, &_min($this{spam}/$total{spam},1)/ (&_min($this{nonspam}/$total{nonspam},1)+ &_min($this{spam}/$total{spam},1)) )); } } } my @toklist=sort {abs($self->{tokencache}->{$Mail::SpamTest::Bayesian::b}-0.5) <=> abs($self->{tokencache}->{$Mail::SpamTest::Bayesian::a}-0.5)} @tokens; @toklist=@toklist[0..($self->{significant}-1)]; my $p=0.5; foreach (map {$self->{tokencache}->{$_}} @toklist) { $p *= $_ / ( ($p*$_) + ((1-$p) * (1-$_))); } my $s=0; if ($p >= $self->{threshold}) { $s=1; } @toklist=map {"$_ (".sprintf('%.3f',$self->{tokencache}->{$_}).")"} sort {$self->{tokencache}->{$Mail::SpamTest::Bayesian::a} <=> $self->{tokencache}->{$Mail::SpamTest::Bayesian::b} || $Mail::SpamTest::Bayesian::a cmp $Mail::SpamTest::Bayesian::b} @toklist; return ($s,$p,\@toklist); } sub _tokenise_message { my $self=shift; my ($message)=@_; my $data=$self->{parser}->parse_data($message); my @keep=grep { $_->mime_type =~ /^text\/(plain|html)$/ } $data->parts; $data->parts(\@keep); my @message=($data->head->as_string); for (my $i = 0; $i < $data->parts; $i++) { my $ent = $data->parts($i); if (my $io = $ent->open("r")) { while (defined(my $line = $io->getline)) { push(@message, $line); } $io->close; } } my @token; foreach my $line (@message) { foreach my $token (split /[^-\$A-Za-z0-9\']+/,$line) { if ($token ne '') { push @token,$token; } } } return @token; } sub _min { my @t=@_; my $a=$t[0]; foreach my $b (@t[1..$#t]) { if ($b<$a) { $a=$b; } } return $a; } sub _max { my @t=@_; my $a=$t[0]; foreach my $b (@t[1..$#t]) { if ($b>$a) { $a=$b; } } return $a; } 1; __END__ =head1 AUTHOR Roger Burton West, E<lt>roger@firedrake.orgE<gt> =head1 ACKNOWLEDGEMENTS Erwin Harte provided useful feedback and the de-MIMEing code. =head1 SEE ALSO L<perl>, L<BerkeleyDB>. =cut