=head1 NAME

Blog::Spam::Plugin::stopwords - Reject comments which contain known-bad words.

=cut

=head1 ABOUT

This plugin is designed to discard comments which contain terms listed
in a local blacklist.

=cut

=head1 DETAILS

The blacklist must be created and updated by the maintainer of the
local system, and will be found at B</etc/blogspam/stoplist>.

=cut

=head1 AUTHOR

=over 4

=item Steve Kemp

http://www.steve.org.uk/

=back

=cut

=head1 LICENSE

Copyright (c) 2008-2010 by Steve Kemp.  All rights reserved.

This module is free software;
you can redistribute it and/or modify it under
the same terms as Perl itself.
The LICENSE file contains the full text of the license.

=cut




package Blog::Spam::Plugin::stopwords;


use strict;
use warnings;


=begin doc

Constructor.  Called when this plugin is instantiated.

=end doc

=cut

sub new
{
    my ( $proto, %supplied ) = (@_);
    my $class = ref($proto) || $proto;

    my $self = {};

    # verbose?
    $self->{ 'verbose' } = $supplied{ 'verbose' } || 0;

    # the file of stop-words
    $self->{ 'stopwords' } = "/etc/blogspam/stopwords";

    bless( $self, $class );
    return $self;
}


=begin doc

Block if we find some particular stop-words in the body of the message.

=end doc

=cut

sub testComment
{
    my ( $self, %params ) = (@_);

    #
    #  Get the body content, and split it by the
    # whitespace.
    #
    my $body = lc( $params{ 'comment' } );

    #
    #  If we have a name or subject append that too.
    #
    foreach my $term (qw! subject name !)
    {
        my $extra = $params{ $term };
        if ( defined($extra) && length($extra) )
        {
            $body .= " " . lc($extra) . " ";
        }
    }

    #
    #  The stop-words come from this file.
    #
    my $file = $self->{ 'stopwords' } || undef;

    #
    #  No file?  Then return.
    #
    return "OK" if ( !defined($file) || !-e $file );


    #
    #  Get the mtime of the file.
    #
    my ( $dev,  $ino,   $mode,  $nlink, $uid,     $gid, $rdev,
         $size, $atime, $mtime, $ctime, $blksize, $blocks
       ) = stat($file);

    #
    #  If we've not loaded, or the file modification time has
    # changed then reload.
    #
    if ( !$self->{ 'wmtime' } ||
         $self->{ 'wmtime' } < $mtime )
    {
        $self->{ 'words' } = undef;

        if ( open( WORDS, "<", $file ) )
        {

            $self->{ 'words' } = undef;
            $self->{ 'verbose' } &&
              print "re-reading stopwords from $file\n";

            while ( my $line = <WORDS> )
            {

                # skip blank lines
                next unless ( $line && length($line) );

                # Skip lines beginning with comments
                next if ( $line =~ /^([ \t]*)\#/ );

                # strip spaces
                $line =~ s/^\s+|\s+$//g;

                # strip newline
                chomp($line);

                # empty now?
                next unless length($line);

                $self->{ 'words' }{ lc $line } = 1;

                $self->{ 'verbose' } && print "Blacklisting term: $line\n";

            }
            close(WORDS);
            $self->{ 'wmtime' } = $mtime;
        }
    }

    #
    #  Split the comment into tokens.  Simplistically.
    #
    foreach my $word ( split( /[ \t\r\n]/, $body ) )
    {

        #
        # remove non letters - we already lower-cased so
        # this just removes leading/trailing punctution,etc.
        #
        $word =~ s/[^a-z]//g;

        next if ( !length($word) );

        if ( $self->{ 'words' }{ $word } )
        {
            return "SPAM:stopwords:$word";
        }
    }

    return "OK";
}


1;