package Lingua::Jspell::DictManager; use 5.006; use strict; use warnings; use Data::Dumper; use File::Copy; use YAML 'LoadFile'; use File::Spec::Functions; use Lingua::Jspell; require Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw() ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our @EXPORT = qw( &toword &install_dic ); our $VERSION = '0.01'; # Preloaded methods go here. sub install_dic{ my %opt =(yaml => undef, name=>undef); ## irr => "name.irr" if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ; my ($aff,@dic)=@_; my $cpaff=1; if($aff =~ /^from:(.*)/){ $aff = catfile($Lingua::Jspell::JSPELLLIB,"$1.aff") ; $cpaff=0;} my $ya; open(F,">__$$.dic") or die("Error 1: $!\n"); for (@dic){open(G, $_) or die("Error 2($_): $!\n"); print F ; close G; } close F; $ya = LoadFile($opt{yaml}) if $opt{yaml}; my $name = $opt{name} || $ya->{META}{IDS}[0] || $dic[0]; if($opt{hash}){ copy($opt{hash}, "__$$.hash"); } else { system ("jbuild __$$.dic $aff __$$.hash"); } if($opt{irr}){ copy($opt{irr},catfile($Lingua::Jspell::JSPELLLIB,$opt{irr})) or warn ("Error 3: $!"); } copy("__$$.hash",catfile($Lingua::Jspell::JSPELLLIB,"$name.hash")) or warn ("Error 4: $!"); if($cpaff){ copy($aff, catfile($Lingua::Jspell::JSPELLLIB,"$name.aff")) or warn ("Error 5: $!"); } if ($opt{yaml}){ copy($opt{yaml}, catfile($Lingua::Jspell::JSPELLLIB,"$name.yaml")) or warn ("Error 6: $!"); for(@{$ya->{META}{IDS}}){ copy("__$$.hash",catfile($Lingua::Jspell::JSPELLLIB,"$_.hash")) or warn ("Error 7: $!"); } } unlink("__$$.dic","__$$.hash","__$$.dic.cnt","__$$.dic.stat"); } sub init{ my $file = shift; my $self = { filename => $file }; open F, $file or die "Cannot open file '$file': $!\n"; while() { $self->{ shortcut}{$1} = $2 if (m!^#([^/]+)/([^/]+)/!); $self->{revshortcut}{$2} = $1 if (m!^#([^/]+)/([^/]+)/!); } close F; copy($file,"$file.old") or die("$! cant create $file.old\n"); return bless($self); } sub toword{ _data2line(@_) } sub modeach_word{ my %opt =(rawfea => 0); my $dic = shift; if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ; my $func = shift; open DIC, $dic->{filename} or die("cannot open file"); open NDIC, ">$dic->{filename}.new" or die("cannot create file $!"); while() { if (m!^#! or m!^\s*$!){ print NDIC $_ ; next } chomp; my ($word,$class,$flags,@r) = split '/', $_; my @flags = ($flags)?split(//, $flags):(); if(not $opt{rawfea}){ my @atts; if ($class =~ /^\$/){ @atts = (special => $class)} else { $class =~ s/#([A-Za-z][A-Za-z0-9]*)/$dic->{shortcut}{$1} || ""/ge if $class; @atts = ($class)?split(/[,=]/, $class):(); } my %atts; if (@atts % 2) { %atts = (); } else { %atts = @atts; } print NDIC $func->($word,\%atts,\@flags,@r) || $_; } else { print NDIC $func->($word,$class,\@flags,@r) || $_; } print NDIC "\n"; } close DIC; close NDIC; copy("$dic->{filename}.new",$dic->{filename}); } sub foreach_word { my %opt =(type => "struct"); my $dic = shift; if(ref($_[0]) eq "HASH") {%opt = (%opt , %{shift(@_)}) } ; my $func = shift; open DIC, $dic->{filename} or die("cannot open file"); while() { next if m!^#!; next if m!^\s*$!; chomp; my ($word,$class,$flags,@r) = split '/', $_; if($opt{type} eq "struct"){ $class =~ s/#([A-Za-z][A-Za-z0-9]*)/$dic->{shortcut}{$1} || ""/ge if $class; my @flags = ($flags)?split(//, $flags):(); my @atts = ($class)?split(/[,=]/, $class):(); my %atts; if (@atts % 2) { %atts = (); } else { %atts = @atts; } $func->($word,\%atts,\@flags,@r); } elsif( $opt{type} eq "raw"){ $func->($_); } } close DIC; } sub for_this_cat_I_want_only_these_flags { my $dic = shift; my $cat = shift; $cat =~ s/#//g; my $flags = shift; my %flags; @flags{split //,$flags}=1; foreach_word($dic, sub { my ($w,$a,$f) = @_; my %fs = %flags; my $ct = $cat; my $this_cat = $a->{CAT} || "unknown"; if ($this_cat eq $ct) { my $fl; for $fl(@$f) { unless (exists($fs{$fl})) { print "$w from category '$cat' uses flag '$fl'\n"; } } } }); } sub for_this_cat_I_dont_want_these_flags { my $dic = shift; my $cat = shift; $cat =~ s/#//g; my $flags = shift; my %flags; @flags{split //,$flags}=1; foreach_word($dic, sub { my ($w,$a,$f) = @_; my %fs = %flags; my $ct = $cat; my $this_cat = $a->{CAT} || "unknown"; if ($this_cat eq $ct) { my $fl; for $fl(@$f) { if (exists($fs{$fl})) { print "$w from category '$cat' uses flag '$fl'\n"; } } } }); } sub not_categorized { my $dic = shift; open DIC, $dic->{filename} or die("Cannot open file"); while() { chomp; next if /^#/; next if /^\s*$/; m{^([^/]+)/}; my $word = $1; my $cat = $'; next unless ($cat =~ m!^/!); print "word '$word' doesn't have a categorie\n"; } close DIC; } sub extra_words { my $dic = shift; my %from; my ($r,$word,$fea,$fea1,$t); my $jdic = Lingua::Jspell->new("port"); open DIC, $dic->{filename} or die("Cannot open file"); while() { chomp; next if /^#/; next if /^\s*$/; m{^([^/]+)/}; $word = $1; my @rads = $jdic->rad($word); if (@rads > 1) { print STDERR "." if rand > 0.99; for $r (@rads) { next if ($r eq $word); # for the fea from $word, get the rad==$r for $fea ($jdic->fea($word)) { if ($fea->{rad} eq $word) { for $fea1 (fea($r)) { if (_same_cat($fea1->{CAT},$fea->{CAT})) { $from{$r} = {word=>$word, orig=>$fea1, dest=>$fea}; } } } } # $from{$r} = {word=>$word}; } } } close DIC; for (keys %from) { if ($from{$from{$_}{word}}{word}) { print "# warning: multiple dependence\n"; print "# \t$_ => $from{$_}{word} => $from{$from{$_}{word}}{word}\n"; delete($from{$_}); } else { print "# from $_ you can get $from{$_}{word}\n"; print "delete_word('$from{$_}{word}')\n"; } } } sub _same_cat { my ($a,$b) = @_; if (defined($a) && defined($b)) { return ($a eq $b); } else { return 0; } } # Each element should be a reference to an associative array like this: # # { word => 'word', flags => 'zbr', CAT => 'np', G=>'f' } sub add_word { my $dict = shift; $dict->_add_full_line(map { my $word = $_->{word}; my $flags = $_->{flags}; my $comment = $_->{comment} || ""; delete($_->{word}); delete($_->{flags}); delete($_->{comment}); my %hash = %$_; my $info = join(",",map {"$_=$hash{$_}"} keys %hash); "$word/$info/$flags/$comment" } @_); } sub _add_full_line { my $dict = shift; my %saw =(); @saw{@_} = (); my @v; open DIC, $dict->{filename} or die("cannot open dictionary file"); open NDIC, ">$dict->{filename}.new" or die("cannot open new dictionary file"); while () { push @v,$_ and next if (/^#/); chomp; $saw{$_} = 1; } print NDIC join "", @v; print NDIC "\n\n"; print NDIC map {/./ ? ("$_\n"):()} sort keys %saw; close DIC; close NDIC; copy("$dict->{filename}.new",$dict->{filename}); } sub delete_word { my $dict = shift; my $pal=shift; my $t; open DIC, $dict->{filename} or die("cannot open dictionary file"); open NDIC, ">$dict->{filename}.new" or die("cannot open new dictionary file"); while () { $t = $1 if /^(.+?)\//; print NDIC unless ($t=~/^$pal$/); } close DIC; close NDIC; copy("$dict->{filename}.new",$dict->{filename}); } sub add_flag { my $dic = shift; my $flag = shift; my %words; @words{@_} = 1; $dic -> foreach_word( sub { my ($w,$a,$f) = @_; my %fs; @fs{@$f}=1; if ($words{$w}) { @fs{split //, $flag}=1;; print _data2line($w,$a,join("",keys %fs)); } print _data2line($w,$a,$f); }); } #$pal=shift; #($ac,$flag)=(shift=~/([\+\-])(.)/); # #while (<>) { #print $_ and next if ($_=~/^#/ || $_ eq "\n"); #$_=~s#\n#/\n# unless ($_=~/.*\/.*\/.*\//); #($a,$b,$c,$d)=($_=~/^(.+?)\/(.*?)\/(.*?)\/(.*)/); #$c=~s#$flag##g if ($a=~/^$pal$/); #$c.=$flag if ($a=~/^$pal$/ && $ac eq "+"); #print "$a/$b/$c/$d\n"; #} sub _data2line { my ($word,$atts,$flags,@r) = @_; if(ref $atts){ return "$word/". join(",",map { "$_=$atts->{$_}" } keys %$atts). "/". join("",grep {/./} @$flags). "/". join("/",@r); } else { return "$word/". $atts . "/". join("",grep {/./} @$flags). "/". join("/",@r); } } =head1 NAME Lingua::Jspell::DictManager - a perl module for processing jspell dictionaries =head1 SYNOPSIS use Lingua::Jspell::DictManager; $dict = init("dictionary file"); $dict->foreach_word( \&func ); $dict->for_this_cat_I_want_only_these_flags('nc', 'fp'); $dict->add_flag("p","linha","carro",...); $dict->add_word({word=>'word',flags=>'zbr',CAT=>'np',G=>'f'},...) remflag("f.dic","p","linha","carro",...); =head1 DESCRIPTION =head2 C This function returns a new dictionary object to be used in future methods. It requires a string with the dictionary file name. =head2 C install_dic({name=>"teste"} ,"t.aff", "t.dic") install_dic({name=>"t"} ,"from:port", "t1.dic", "t2.dic") install_dic({yaml=>"t.yaml"} ,"from:port", "t1.dic", "t2.dic") install_dic({yaml=>"t.yaml",irr=>"f.irr"} ,"from:port", "t1.dic") C is used to reuse the affix table from language C (the file lang.aff is imported from the jspell library directory. (see jspell-dic -dir) name -- name of the dictionary yaml -- yaml file with metadata irr -- file with irregular terms =head2 C This method processes all words from the dictionary using the function passed as argument. This function is called with three arguments: the word, a reference to an associative array with the category information and a reference to a list of rules identifiers. =head2 C This method processes all words from the dictionary using the function passed as argument. This function is called with three arguments: the word, a reference to an associative array with the category information and a reference to a list of rules identifiers. If the option C<< rawfea =>1 >> is selected, modeach_word receives a string instead of a hash reference. modeach_word({rawfea=>1}, sub { my($w,$cat,$flags,@com)=@_; ... }) Use the function C to rebuild a new value; if "" is return, the previous value is kept. =head2 C This method receives a gramatical category and a string with flags. It will print warning messages for each entry with that category and with a flag not described in the flags string. =head2 C Works like the previous method, but will print warnings if any category uses one of the specificed flags. =head2 C This method returns a report for the entries without a category definition. =head2 C This method tries to find redundant entries on the dictionary, producing an ouput file to be executed and delete the redundancy. =head2 C Add (one or more) word to the dictionary $dict->add_word({word=>'word',flags=>'zbr',CAT=>'np',G=>'f'},...) =head2 C Deletes the word passed as argument. =head2 C Adds the flags in the first argument to all words passed. =head2 C to format Word, features, flags and commants to jspell-dict format. This functions is tically used em C. =head1 AUTHOR Alberto Simoes, Ealbie@alfarrabio.di.uminho.ptE J.Joao Almeida, Ejj@di.uminho.ptE =head1 SEE ALSO Lingua::Jspell(3), jspell(1) =head1 COPYRIGHT & LICENSE Copyright 2007-2009 Projecto Natura This program is free software; licensed under GPL. =cut 1; __END__