package Lingua::Jspell;

use warnings;
use strict;

use POSIX qw(locale_h);
setlocale(LC_CTYPE, "pt_PT");
use locale;

use base 'Exporter';
our @EXPORT_OK = (qw.onethat verif nlgrep setstopwords ok any2str hash2str.);
our %EXPORT_TAGS = (basic => [qw.onethat verif ok any2str hash2str.],
                    greps => [qw.nlgrep setstopwords.]);

use File::Which qw/which/;
use IPC::Open3;

=head1 NAME

Lingua::Jspell - Perl interface to the Jspell morphological analyser.

=cut

our $VERSION = '1.50_05';
our $JSPELL;
our $JSPELLLIB;
our $MODE = { nm => "af", flags => 0 };
our $DELIM = '===';
our %STOP =();

BEGIN {
  # Search for jspell binary.
  $JSPELL = which("jspell");
  my $JSPELLDICT = which("jspell-dict");
  if (!$JSPELL) {
	# check if we are running under make test
	$JSPELL = "blib/script/jspell";
	$JSPELLDICT = "blib/script/jspell-dict";
	$JSPELL = undef unless -e $JSPELL;
  }
  die "jspell binary cannot be found!\n" unless -e $JSPELL;

  chomp($JSPELLLIB = `$JSPELLDICT --dic-dir`);
}

=head1 SYNOPSIS

    use Lingua::Jspell;

    my $dic = Lingua::Jspell->new( "dict_name");
    my $dic = Lingua::Jspell->new( "dict_name" , "personal_dict_name");

    $dict->rad("gatinho");      # list of radicals (gato)

    $dict->fea("gatinho");      # list of possible analysis

    $dict->der("gato");         # list of derivated words

    $dict->flags("gato");       # list of roots and flags

=head1 FUNCTIONS

=head2 new

Use to open a dictionary. Pass it the dictionary name and optionally a
personal dictionary name. A new jspell dictionary object will be
returned.

=cut

sub new {
  my ($self, $dr, $pers, $flag);
  local $/="\n";
  my $class = shift;

  $self->{dictionary} = shift;
  $self->{pdictionary} = shift ||
    (defined($ENV{HOME})?"$ENV{HOME}/.jspell.$self->{dictionary}":"");

  $pers = $self->{pdictionary}?"-p $self->{pdictionary}":"";
  $flag = defined($self->{'undef'})?$self->{'undef'}:"-y";

  ## Get meta info
  my $meta_file = _meta_file($self->{dictionary});
  if (-f $meta_file) {
    open META, $meta_file or die "$!";
    while(<META>) {
      next if m!^\s*$!;
      next if m!^\s*#!;
      s!#.*$!!;
      if (m!^(\w+):\s*(.*)!) {
        $self->{meta}{_}{$1} = $2;
      }
      if (m!^(\w+)=(\w+):\s*(.*)!) {
        $self->{meta}{$1}{$2} = $3;
      }
    }
    close META;
  } else {
    $self->{meta} = {};
  }

  $self->{pid} = open3($self->{DW},$self->{DR},$self->{DE},
		       "$JSPELL -d $self->{dictionary} -a $pers -W 0 $flag -o'%s!%s:%s:%s:%s'") ||
			 die "Cannot find 'jspell'";
  binmode($self->{DW},":bytes");
  binmode($self->{DR},":bytes");
  $dr = $self->{DR};
  my $first_line = <$dr>;

  $self->{mode} ||= $MODE;
  my $dw = $self->{DW};
  print $dw _mode($self->{mode});

  if ($first_line  =~ /Jspell/) { return bless $self, $class }  #amen
  else                          { return undef}
}

=head2 setmode

=cut

sub setmode {
  my ($self, $mode) = @_;

  my $dw = $self->{DW};
  if (defined($mode)) {
    $self->{mode} = $mode;
    print $dw _mode($mode);
  } else {
    return $self->{mode}
  }
}

=head2 fea

Returns a list of analisys of a word. Each analisys is a list of
attribute value pairs. Attributes available: CAT, T, G, N, P, ....

  @l = $dic->fea($word)

=cut


sub fea{
  my ($self,$w) = @_;

  local $/="\n";

  my @r = ();
  my ($a, $rad, $cla, $flags);

  return () if $w =~ /\!/;

  my ($dw,$dr) = ($self->{DW},$self->{DR});

  print $dw " $w\n";
  $a = <$dr>;

  for (;($a ne "\n"); $a=<$dr>) {       # l^e as respostas
    for($a){
      chop;
      my ($lixo,$clas);
      if(/(.*?) :(.*)/){$clas = $2 ; $lixo =$1}
      else             {$clas = $_ ; $lixo =""}

      for(split(/[,;] /,$clas)){
        ($rad,$cla)= m{(.+?)\!:*(.*)$};

	# Não sei porquê, mas acontece por vezes de $cla ser 'undef'
	# Não sei bem o que devemos fazer... de momento, estou simplesmente
	# a passar o código à frente.
	if ($cla) {
	  if ($cla =~ s/\/(.*)$//) { $flags = $1 }
	  else                     { $flags = "" }

	  $cla =~ s/:+$//g;
	  $cla =~ s/:+/,/g;

	  my %ana;
	  my @attrs = split /,/, $cla;
	  for (@attrs) {
	    if (m!=!) {
	      $ana{$`}=$';
	    } else {
	      print STDERR "** WARNING: Feature-structure parse error: $cla (for word '$w')\n";
	    }
	  }

	  $ana{"flags"} = $flags if $flags;

	  if ($lixo =~ /^&/) {
	    $rad =~ s/(.*?)= //;
	    $ana{"guess"} = lc($1);
	    $ana{"unknown"} = 1;
	  }
	  if ($rad ne "" ) {
	    push(@r,+{"rad" => $rad, %ana});
	  }
	}
      }
    }
  }
  return @r;
}

=head2 flags

=cut

sub flags {
  my $self = shift;
  my $w = shift;
  my ($a,$dr);
  local $/="\n";

  print {$self->{DW}} "\$\"$w\n";
  $dr = $self->{DR};
  $a = <$dr>;

  chop $a;
  return split(/[# ,]+/,$a);
}

=head2 rad

Returns the list of all possible radicals/lemmas for the supplied word.

  @l = $dic->rad($word)

=cut

sub rad {
  my $self = shift;
  my $word = shift;

  return () if $word =~ /\!/;

  my %rad = ();
  my $a_ = "";
  local $/ = "\n";

  my ($dw,$dr) = ($self->{DW},$self->{DR});

  print $dw " $word\n";

  for ($a_ = <$dr>; $a_ ne "\n"; $a_ = <$dr>) {
    chop $a_;
    %rad = ($a_ =~ m/(?: |:)([^ =:,!]+)(\!)/g ) ;
  }

  return (keys %rad);
}


=head2 der

Returns the list of all possible words using the word as radical.

  @l = $dic->der($word);

=cut

sub der {
  my ($self, $w) = @_;
  my @der = $self->flags($w);
  my %res = ();
  my $command;

  $command = sprintf("echo '%s'|$JSPELL -d $self->{dictionary} -e -o '' ",join("\n",@der));

  local $/ = "\n";

  for (`$command`) {
    chop;
    s/(=|, | $)//g;
    for(split) { $res{$_}++; }
  }

  my $irrcomm;

  # This need to be tested
  my $irr_file = _irr_file($self->{dictionary});
  $irrcomm = sprintf("grep '^%s=' $irr_file",$w);

  for (`$irrcomm`){
    chop;
    for (split(/[= ]+/,$_)) { $res{$_}++; }
  }

  return keys %res;
}

=head2 onethat

Returns the first Feature Structure from the supplied list that
verifies the Feature Structure Pattern used.

   $analysis = onethat( { CAT=>'adj' }, @features);

=cut

sub onethat {
  my ($a, @b) = @_;
  for (@b) {
    return %$_ if verif($a,$_);
  }
  return () ;
}

=head2 verif

Retuurns a true value if the second Feature Structure verifies the
first Feature Structure Pattern.

   if (verif( $pattern, $feature) )  { ... }

=cut

sub verif {
  my ($a, $b) = @_;
  for (keys %$a) {
    return 0 if (!defined($b->{$_}) || $a->{$_} ne $b->{$_}); 
  }
  return 1;
}

=head2 nlgrep

=cut

sub nlgrep {
  # max=int, sep:str, radtxt:bool
  my %opt = (max=>10000, sep => "\n",radtxt=>0);
  %opt = (%opt,%{shift(@_)}) if ref($_[0]) eq "HASH";

  my $p = shift;

  my $pattern = $opt{radtxt} ? $p : join("|",(der($p)));
  my $p2 = qr/\b(?:$pattern)\b/i;

  my @file_list=@_;
  local $/=$opt{sep};

  my @res=();
  my $n = 0;
  for(@file_list) {
    open(F,$_) or die("cant open $_\n");
    while(<F>) {
      # if(/\b(?:$pattern)\b/io){}
      if (/$p2/) {
        chomp;
        s/$DELIM.*//g if $opt{radtxt};
        push(@res,$_);
        last if $n++ == $opt{max};
      }
    }
    close F;
    last if $n == $opt{max};
  }
  return @res;
}

=head2 setstopwords

=cut

sub setstopwords {
  $STOP{$_} = 1 for @_;
}

=head2 cat2small

Note: This function is specific for the Portuguese jspell dictionary

=cut

# NOTA: Esta funcao é específica da língua TUGA!
sub _cat2small {
  my %b = @_;

  if ($b{'CAT'} eq 'art') {
    # Artigos: o léxico já prevê todos...
    # por isso, NUNCA SE DEVE CHEGAR AQUI!!!
    return "ART";
    # 16 tags

  } elsif ($b{'CAT'} eq 'card') {
    # Numerais cardinais:
    return "DNCNP";
    # o léxico já prevê os que flectem (1 e 2); o resto é tudo neutro plural.

  } elsif ($b{'CAT'} eq 'nord') {
    # Numerais ordinais:
    return "\UDNO$b{'G'}$b{'N'}";

  } elsif ($b{'CAT'} eq 'ppes' || $b{'CAT'} eq 'prel' ||
           $b{'CAT'} eq 'ppos' || $b{'CAT'} eq 'pdem' ||
           $b{'CAT'} eq 'pind' || $b{'CAT'} eq 'pint') {
    # Pronomes:
    if ($b{'CAT'} eq 'ppes') {
      # Pronomes pessoais
      $b{'CAT'} = 'PS';
    } elsif ($b{'CAT'} eq 'prel') {
      # Pronomes relativos
      $b{'CAT'} = 'PR';
    } elsif ($b{'CAT'} eq 'ppos') {
      # Pronomes possessivos
      $b{'CAT'} = 'PP';
    } elsif ($b{'CAT'} eq 'pdem') {
      # Pronomes demonstrativos
      $b{'CAT'} = 'PD';
    } elsif ($b{'CAT'} eq 'pint') {
      # Pronomes interrogativos
      $b{'CAT'} = 'PI';
    } elsif ($b{'CAT'} eq 'pind') {
      # Pronomes indefinidos
      $b{'CAT'} = 'PF';
    }

    $b{'G'} = 'N' if $b{'G'} eq '_';
    $b{'N'} = 'N' if $b{'N'} eq '_';

    return "\U$b{'CAT'}$b{'C'}$b{'G'}$b{'P'}$b{'N'}";
    #                        $b{'C'}: caso latino.

  } elsif ($b{'CAT'} eq 'nc') {
    # Nomes comuns:
    $b{'G'} = 'N' if $b{'G'} eq '_' || $b{'G'} eq '';
    $b{'N'} = 'N' if $b{'N'} eq '_' || $b{'N'} eq '';
    return "\U$b{'CAT'}$b{'G'}$b{'N'}";

  } elsif ($b{'CAT'} eq 'np') {
    # Nomes próprios:
    $b{'G'} = 'N' if $b{'G'} eq '_' || $b{'G'} eq '';
    $b{'N'} = 'N' if $b{'N'} eq '_' || $b{'N'} eq '';
    return "\U$b{'CAT'}$b{'G'}$b{'N'}";

  } elsif ($b{'CAT'} eq 'adj') {
    # Adjectivos:
    $b{'G'} = 'N' if $b{'G'} eq '_';
    $b{'G'} = 'N' if $b{'G'} eq '2';
    $b{'N'} = 'N' if $b{'N'} eq '_';
    #    elsif ($b{'N'} eq ''){
    #      $b{'N'} = 'N';
    #    }
    return "\UJ$b{'G'}$b{'N'}";

  } elsif ($b{'CAT'} eq 'a_nc') {
    # Adjectivos que podem funcionar como nomes comuns:
    $b{'G'} = 'N' if $b{'G'} eq '_';
    $b{'G'} = 'N' if $b{'G'} eq '2';
    $b{'N'} = 'N' if $b{'N'} eq '_';
    #    elsif ($b{'N'} eq ''){
    #      $b{'N'} = 'N';
    #    }
    return "\UX$b{'G'}$b{'N'}";

  } elsif ($b{'CAT'} eq 'v') {
    # Verbos:

    # formas nominais:
    if ($b{'T'} eq 'inf') {
      # infinitivo impessoal
      $b{'T'} = 'N';

    } elsif ($b{'T'} eq 'ppa') {
      # Particípio Passado
      $b{'T'} = 'PP';

    } elsif ($b{'T'} eq 'g') {
      # Gerúndio
      $b{'T'} = 'G';

    } elsif ($b{'T'} eq 'p') {
      # modo indicativo: presente (Hoje)
      $b{'T'} = 'IH';

    } elsif ($b{'T'} eq 'pp') {
      # modo indicativo: pretérito Perfeito
      $b{'T'} = 'IP';

    } elsif ($b{'T'} eq 'pi') {
      # modo indicativo: pretérito Imperfeito
      $b{'T'} = 'II';

    } elsif ($b{'T'} eq 'pmp') {
      # modo indicativo: pretérito Mais-que-perfeito
      $b{'T'} = 'IM';

    } elsif ($b{'T'} eq 'f') {
      # modo indicativo: Futuro
      $b{'T'} = 'IF';

    } elsif ($b{'T'} eq 'pc') {
      # modo conjuntivo (Se): presente (Hoje)
      $b{'T'} = 'SH';

    } elsif ($b{'T'} eq 'pic') {
      # modo conjuntivo (Se): pretérito Imperfeito
      $b{'T'} = 'SI';

    } elsif ($b{'T'} eq 'fc') {
      # modo conjuntivo (Se): Futuro
      $b{'T'} = 'PI';

    } elsif ($b{'T'} eq 'i') {
      # modo Imperativo: presente (Hoje)
      $b{'T'} = 'MH';

    } elsif ($b{'T'} eq 'c') {
      # modo Condicional: presente (Hoje)
      $b{'T'} = 'CH';

    } elsif ($b{'T'} eq 'ip') {
      # modo Infinitivo (Pessoal ou Presente): 
      $b{'T'} = 'PI';

      # Futuro conjuntivo? Só se tiver um "se" antes! -> regras sintácticas...
      # modo&tempo não previstos ainda...

    } else {
      $b{'T'} = '_UNKNOWN';
    }

    # converter 'P=1_3' em 'P=_': provisório(?)!
    $b{'P'} = '_' if $b{'P'} eq '1_3'; # único sítio com '_' como rhs!!!

    return "\U$b{'CAT'}$b{'T'}$b{'G'}$b{'P'}$b{'N'}";
    #                               Género, só para VPP.
    # +/- 70 tags

  } elsif ($b{'CAT'} eq 'prep') {
    # Preposições¹:
    return "\UP";

  } elsif ($b{'CAT'} eq 'adv') {
    # Advérbios²:
    return "\UADV";

  } elsif ($b{'CAT'} eq 'con') {
    # Conjunções²:
    return "\UC";

  } elsif ($b{'CAT'} eq 'in') {
    # Interjeições¹:
    return "\UI";

    # ¹: não sei se a tag devia ser tão atómica, mas para já não há confusão!

  } elsif ($b{'CAT'} =~ m/^cp(.*)/) {
    # Contracções¹:
    $b{'G'} = 'N' if $b{'G'} eq '_';
    $b{'N'} = 'N' if $b{'N'} eq '_';
    return "\U&$b{'G'}$b{'N'}";

    # ²: falta estruturar estes no próprio dicionário...
    # Palavras do dicionário com categoria vazia ou sem categoria,
    # palavras não existentes ou sequências aleatórias de caracteres:

  } elsif ($b{'CAT'} eq '') {
    return "\UUNDEFINED";

  } else {   # restantes categorias (...?)
    return "\UUNTREATED";
  }
}

=head2 featags

=cut

sub featags{
  my ($self, $palavra) = @_;
  return (map {_cat2small(%$_)} ($self->fea($palavra)));
}

=head2 ok

 # ok: cond:fs x ele:fs-set -> bool
 # exist x in ele : verif(cond , x)

=cut

sub ok {
  my ($a, @b) = @_;
  for (@b) {
    return 1 if verif($a,$_);
  }
  return 0 ;
}

=head2 mkradtxt

=cut

sub mkradtxt {
  my ($self, $f1, $f2) = @_;
  open F1, $f1 or die "Can't open '$f1'\n";
  open F2, "> $f2" or die "Can't create '$f2'\n";
  while(<F1>) {
    chomp;
    print F2 "$_$DELIM";
    while (/((\w|-)+)/g) {
      print F2 " ",join(" ",$self->rad($1)) unless $STOP{$1}
    }
    print F2 "\n";
  }
  close F1;
  close F2;
}

=head2 any2str

=cut

sub any2str {
  my ($r, $i) = @_;
  $i ||= 0;
  if ($i eq "compact") {
    if (ref($r) eq "HASH") {
      return "{". hash2str($r,$i) . "}"
    } elsif (ref($r) eq "ARRAY") {
      return "[" . join(",", map (any2str($_,$i), @$r)) . "]" 
    } else {
      return "$r"
    }
  } else {
    my $ind = ($i >= 0)? (" " x $i) : "";
    if (ref($r) eq "HASH") {
      return "$ind {". hash2str($r,abs($i)+3) . "}"
    } elsif (ref($r) eq "ARRAY") {
      return "$ind [\n" . join("\n", map (any2str($_,abs($i)+3), @$r)) . "]"
    } else {
      return "$ind$r"
    }
  }
}

=head2 hash2str

=cut

sub hash2str {
  my ($r, $i) = @_;
  my $c = "";
  if ($i eq "compact") {
    for (keys %$r) {
      $c .= any2str($_,$i). "=". any2str($r->{$_},$i). ",";
    }
    chop($c);
  } else {
    for (keys %$r) {
      $c .= "\n". any2str($_,$i). " => ". any2str($r->{$_},-$i);
    }
  }
  return $c;
}

=head1 AUTHOR

Jose Joao Almeida, C<< <jj@di.uminho.pt> >>
Alberto Simões, C<< <ambs@di.uminho.pt> >>

=head1 BUGS

Please report any bugs or feature requests to
C<bug-lingua-jspell@rt.cpan.org>, or through the web interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Lingua-Jspell>.  I
will be notified, and then you'll automatically be notified of
progress on your bug as I make changes.

=head1 COPYRIGHT & LICENSE

Copyright 2007-2008 Projecto Natura

This program is free software; licensed undef GPL.

=cut

sub _meta_file {
  my $dic_file = shift;
  if ($dic_file =~ m!\.hash$!) {
    # we have a local dictionary
    $dic_file =~ s/\.hash/.meta/;
  } else {
    $dic_file = "$JSPELLLIB/$dic_file.meta"
  }
  return $dic_file;
}

sub _mode {
  my $m = shift;
  my $r="";
  if ($m->{nm}) {
    if ($m->{nm} eq "af")
      { $r .= "\$G\n\$P\n\$y\n" }
    elsif ($m->{nm} eq "full")
      { $r .= "\$G\n\$Y\n\$m\n" }
    elsif ($m->{nm} eq "cc")
      { $r .= "\$G\n\$P\n\$Y\n" }
    else {}
  }
  if ($m->{flags})          {$r .= "\$z\n"}
  else                      {$r .= "\$Z\n"}
  return $r;
}


sub _irr_file {
  my $irr_file = shift;
  if ($irr_file =~ m!\.hash$!) {
    # we have a local dictionary
    $irr_file =~ s/\.hash/.irr/;
  } else {
    $irr_file = "$JSPELLLIB/$irr_file.irr"
  }
  return $irr_file;
}



1; # End of Lingua::Jspell

__END__


# sub nlgrepold {
#   my $proc=shift;
#   my $file_list=join(' ',@_);
#   local $/="\n";

#   open(TMPp,"> $tmp/_jspell$$") || die(" can't open tmp ");
#   for (der($proc)) { print TMPp "$_\n" unless $STOP{$_}; }
#   close(TMPp);

#   my @res=();
#   for (`$agrep -h -i -w -f $tmp/_jspell$$ $file_list`) {
#     push(@res,$_);
#   }
#   unlink "$tmp/_jspell$$";
#   @res;
# }

# sub nlgrepold2 {
#   my $p=shift;
#   my %opt=();           # max=int, sep:str, radtxt:bool
#   if(ref($p) eq "HASH"){
#     %opt=%$p;
#     $p=shift}

#   my $file_list=join(' ',@_);
#   local $/=$opt{sep} || "\n";

#   my $max="";
#   $max = "|head -$opt{max}" if $opt{'max'};
#   my $sep="";
#   $sep = "-d '$opt{sep}' -t " if $opt{sep};

#   unless($opt{radtxt}){
#     open(TMPp,"> $tmp/_jspell$$") || die(" can't open tmp ");
#     for (der($p)) { print TMPp "$_\n" unless $STOP{$_}; }
#     close(TMPp);
#   }

#   my @res=();
#   if(defined $opt{radtxt}){
#     for (`$agrep -h -i -w '$p' $file_list  $max`) {
#       chomp;
#       s/$DELIM.*//g;
#       push(@res,$_);
#     } }
#   else{
#     for (`$agrep $sep -h -i -w -f $tmp/_jspell$$ $file_list $max`) {
#       chomp;
#       push(@res,$_);
#     } }
#   unlink "$tmp/_jspell$$" unless $opt{radtxt};
#   @res;
# }

sub nlgrep1{
  my $proc = shift;
  my $file_list = join(' ',@_);
  local $/="\n";

  my @res=();
  for (`$agrep -h -i -w '$proc' $file_list`) {
    if( /(.*?)$DELIM/){ push(@res,$1) };
  }
  @res;
}

sub nlgrep3 {
  my $proc=shift;
  my $qt=shift;
  my $file_list=join(' ',@_);
  local $/="\n";

  open(TMPp,"> $tmp/_jspell$$") || die(" can't open tmp ");
  for (der($proc)) { print TMPp "$_\n" unless $STOP{$_}; }
  close(TMPp);

  my @res=();
  for (`$agrep -h -i -w -f $tmp/_jspell$$ $file_list | head -$qt`) {
    push(@res,$_);
  }
  unlink "$tmp/_jspell$$";
  @res;
}

sub nlgrep2 {
  my $proc=shift;
  my $sep=shift;
  my $file_list=join(' ',@_);
  my $a;

  open(TMPp,"> $tmp/_jspell$$") || die(" can't open tmp\n ");
  for (der($proc)) { print TMPp "$_\n" unless $STOP{$_}; }
  close(TMPp);

  my @res=();
  local $/=$sep;
  open(TMPp,"$agrep -d '$sep' -h -i -w -f $tmp/_jspell$$ $file_list | ") or
    die "cant agrep :-((";

  while ($a=<TMPp>){
    chomp($a);
    push(@res,$a);
  }
  close(TMPp);
  unlink "$tmp/_jspell$$";
  @res;
}

# Esta funcao precisa de ser re-escrita para tirar partido dos
# ficheiros .meta
sub show_fea {
  my $struct = shift;
  for (keys %$struct) {

    if (/^N$/) {
      print "Number: ",(($struct->{$_} eq "p")?"plural":"singular"),"\n";
      next;
    }

    if (/^G$/) {
      print "Genre: ",(($struct->{$_} eq "m")?"masculine":"feminine"),"\n";
      next;
    }

    if (/^CAT$/) {
      my %significado = (
			 nc => 'common name',
			 adj => 'adjective',
			 a_nc => 'common_name / adjective',
			 adv => 'adverb',
			 prep => 'preposition',
			 in => '??',
			 v => 'verb',
			 pind => '??',
			 con => '??',
			 cp => '??',
			);
      print "Categorie: ",$significado{$struct->{$_}},"\n";
      next;
    }

    print "$_ => $struct->{$_}\n";
  }
}