#!/usr/bin/perl

=begin metadata

Name: spell
Description: scan a file for misspelled words
Author: Greg Snow, snow@biostat.washington.edu
License: perl

=end metadata

=cut


use strict;

use File::Basename qw(basename);
use Getopt::Std qw(getopts);

use constant EX_SUCCESS => 0;
use constant EX_FAILURE => 1;

use constant DICT_FILE => '/usr/dict/words';

my $Program = basename($0);

my (
    %words,    # words from dictionary
    %check,    # words to be checked
    @supp,     # file(s) of supplemental dictionaries/words
    @keys,     # keys of the %words hash
    $check,    # Indicator if close matches should be found
    $inter,    # Indicator if in "interactive mode
    $suff,     # Indicator if suffixes should be checked
);

sub usage {
  warn "usage: $Program [-d dict] [-c|-x] [-v] [-i] [+extra_list] [file ...]\n";
  exit EX_FAILURE;
}

keys %words = 45402;   # allocate bins for the hash, it may be useful to
                       #  change this if you dict file is larger or smaller.

my %opt;
getopts('cd:ixv', \%opt) or usage();
$check = 1 if $opt{'c'} || $opt{'x'};
$suff = 1 if $opt{'v'};
$inter = 1 if $opt{'i'};
@supp = ( DICT_FILE );
@supp = ( $opt{'d'} ) if defined $opt{'d'};
for (reverse (0 .. $#ARGV)) {
  if ($ARGV[$_] =~ m/\A\+./) {
    push @supp, substr($ARGV[$_], 1);
    splice @ARGV, $_, 1;
  }
}
# read in dictionary words
for my $dict_file (@supp) {
  next if -d $dict_file;
  my $in;
  unless (open $in, '<', $dict_file) {
    warn "$Program: could not open dictionary <$dict_file>: $!\n";
    exit EX_FAILURE;
  }
  while (<$in>) {
    chomp;
    my $w = lc;
    $words{$w} = 1;
  }
  close $in;
}
@keys = keys %words;
unless (@keys) {
  warn "$Program: word list was empty\n";
  exit EX_FAILURE;
}

# Read data to check

if ($inter) {
  print "Word(s): ";
}

while (<>) {
  chomp;
  last if /^\s*$/ && $inter;
  $_ = lc $_;
  my @tmp_words =grep {/[^\W\d_]/} split;
  for (@tmp_words) {
    s/^[\W_]+//;      # strip out punctuation, etc.
    s/[\W\d_]+$//;
    $check{$_}++;
  }
  if ($inter) {
    check_words( keys %check );
    %check=();
    print "Word(s): ";
  }
}

unless ( $inter ) {
  check_words( keys %check );
}

exit EX_SUCCESS;  # end of main program, below are subroutines.

sub check_words {

  my @check = sort grep { !exists($words{$_}) } @_ ;
  my @suffs;
  my $temp;

  for my $orig ( @check ) {
    $temp = suffix($orig);
    if ($temp) {
      push @suffs, $temp;
      undef $orig;
    }
  }

  if ($inter) {
    print 'Found: '. join(', ', grep { exists($words{$_}) } @_ );

    if ($suff) { # show suffix changes.
      print "\n\nClose Matches:\n";
      for (@suffs) {
	more( "  $_\n");
      }
    }

    print "\n\nNot Found:\n\n";
    for (@check) {
      next unless $_;
      more( "   $_.\n" );

      if ($check) {
	more( "-----\n" );
	for (&close($_)) {
	  more( "    $_\n");
	}
	more("\n");
      }
    }
  }
  else {

    if ($suff) { # show suffix changes.
      for (@suffs) {
	print "$_\n";
      }
      print "\n-----\n\n";
    }

    for (@check) {
      next unless $_;
      print $_."\n";
      if ($check) {
	print "-----\n  ".join("\n  ",&close($_))."\n\n";
      }
    }
  }

}

sub close {
  my $word = shift;
  my %close;
  my ($first, $last, $mid, $mid2);

  # delete 1 char
  $first=$word;
  $last='';

  while( $mid=chop($first) ){
    if (exists($words{$first.$last})) {
      $close{$first.$last}=1;
    }
    $last=$mid.$last;
  }

  # add 1 char
  $first=$word;
  $last='';

  while ( $mid=chop($first) ) {
    $last = $mid.$last;
    my $temp = $first.'.'.$last;
    @close{ grep {/^$temp$/} @keys } = 1;
  }
  @close{ grep {/^$word/} @keys } = 1;

  # change 1 char
  $first=$word;
  $last='';

  while ( $mid=chop($first) ) {
    my $temp = $first.'.'.$last;
    @close{ grep {/^$temp$/} @keys } = 1;
    $last = $mid.$last;
  }

  # swap 2 chars
  $first=$word;
  $last='';
  $mid=chop($first);

  while ( $mid2=chop($first) ) {
    if ( exists($words{ $first.$mid.$mid2.$last }) ) {
      $close{ $first.$mid.$mid2.$last } =1;
    }
    $last = $mid.$last;
    $mid=$mid2;
  }

  return sort keys %close;
}

{
  my $line;
  sub more {
    if ( ++$line > 20 ) {
      print "----- MORE -----";
      scalar <>;
      print "\b"x16;
      $line=0;
    }
    print $_[0];
  }

}

sub suffix {

# This sub handles suffixes (i.e. the word is not in the dictionary,
# but the version without a suffix is.  It returns a string
# representing the difference between the original word and the
# dictionary word, or undef if no such words are found.

# some improvements needed: move the substitution into the if clause,
# so that the word isn't looked for if $word==$orig, put suffixes in
# an array and do this with a loop instead of separate ifs, optimize
# check order.

    #  with the better dictionary (better in my opinion, yours may
    #  vary), this is mostly commented out and the above fixes realy
    #  not needed.

  my $orig = shift;
  my $word;
  ($word = $orig ) =~ s/'s$//;    #';
  if (exists($words{$word})) {
    return qq("$orig" = "$word"  +  "'s");
  }
  if ( $orig =~ /^(\d+-?)(\w+)$/ && exists($words{$2})) {
    return qq("$orig" = "$1"  +  "$2");
  }
  if ( $orig =~ /^(\w+)-(\w+)$/ && exists($words{$1}) && exists($words{$2}) ) {
    return qq("$orig" = "$1"  +  "-"  +  "$2");
  }

# The suffixes below are from an earlier version, using a smaller
# words file.  If your words file does not contain many of the words
# that end with the below suffixes (but does contain the base), then
# uncomment the appropriate statements.

# ($word = $orig ) =~ s/ing$//;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" + "ing");
# }
# ($word = $orig ) =~ s/ing$/e/;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" - "e" + "ing");
# }
# ($word = $orig ) =~ s/ive$/e/;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" - "e" + "ive");
# }
# ($word = $orig ) =~ s/ed$//;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" + "ed");
# }
# ($word = $orig ) =~ s/d$//;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" + "d");
# }
# ($word = $orig ) =~ s/es$//;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" + "es");
# }
# ($word = $orig ) =~ s/s$//;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" + "s");
# }
# ($word = $orig ) =~ s/ies$/y/;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" - "y" + "ies");
# }
# ($word = $orig ) =~ s/ly$//;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" + "ly");
# }
# ($word = $orig ) =~ s/er$//;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" + "er");
# }
# ($word = $orig ) =~ s/or$//;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" + "or");
# }
# ($word = $orig ) =~ s/ion$//;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" + "ion");
# }
# ($word = $orig ) =~ s/ion$/e/;
# if (exists($words{$word})) {
#   return qq("$orig" = "$word" - "e" + "ion");
# }

  return undef;

}

__END__

=head1 NAME

spell - scan a file for misspelled words

=head1 SYNOPSIS

spell [-d dict] [-c|-x] [-v] [-i] [+extra_list] [file ...]

=head1 DESCRIPTION

I<spell> reads in a file and splits it into words, then compares each
unique word with a dictionary file (all comparisons and reporting are
done in lower case).  The "misspelled" words are printed on standard
output.

The options are as follows:

=over 4

=item B<-d> I<file>

Use I<file> as the dictionary instead of the default system dictionary.

=item B<-c>

Check the dictionary for "close" matches.
Each misspelled word will be followed by a list of indented close
matches.  A close match is one where it matches a word in the
dictionary if 1 character is deleted, or 1 letter is added, or any 2
adjacent characters are swapped.

=item B<-x>

Same as B<-c> above.

=item B<-v>

Show suffix expansion.  Some words are inferred by modifying
words in the dictionary, this shows you how that was done on some of
your words, so you can double check if it did it right.

=item B<-i>

Use interactively.  Prompts for words, processes words after
each line, instead of after whole file(s), and quit at first blank line
(do not use on a file, this is for typing in words from standard in).
Also prints descriptive titles and has a very basic pager.

=item +extra_list

Add the file I<extra_list> to the list of dictionaries to check against.

=back

Flags may be grouped, e.g. "spell -icv", and can be specified in any order.

If no file is specified, words to check will be read from standard input.
The B<-i> flag should be used if you are going to type words in by hand.

=head1 SEE ALSO

ispell

/usr/dict/words

I did not create a word list, but found one without restriction
(compiled for linux) at: C<ftp://ftp.cs.unc.edu/pub/users/faith/linux/linux.words.2.tar.gz>.

=head1 RESTRICTIONS

This program is only as good as your word list (and maybe not that
good, use the B<-v> switch at least once).  I did not compile the word
list and cannot vouch for it.  Remember this is a tool and not a
replacement for your own thinking.  Anyone who depends wholly on a
spell checker without doing their own proofreading, deserves what they
get (or worse), I accept no liability for any consequences of using
this program.

This program only finds the misspelled words, it is up to you to
decide if they are really wrong and replace them yourself.

My focus in writing this program was portability, therefore I did not
use some things that may have speeded things up, however perl itself
is quick enough that it still does pretty good even with the slower
method.

=head1 BUGS

I know of no bugs at this time.  There are a couple of things that may
make it run a bit faster and the documentation may be made a little clearer.

=head1 AUTHOR

Gregory L. Snow, (Greg), I<snow@biostat.washington.edu>

=head1 COPYRIGHT and LICENSE

This program is copyright (c) Gregory L. Snow 1999.

This program is free and open software.  You may use, modify, or
distribute this program (with or without modifications) to your hearts
content.  However I take no responsibility for your use or misuse of this program.