#!/usr/bin/perl

use strict;
use warnings;
use 5.010;
use Lingua::NATools::PTD;
use Pod::Usage;
use File::Spec::Functions 'catfile';
use File::Copy 'cp';
use List::MoreUtils 'uniq';
use Term::ReadLine;

=head1 NAME

=encoding UTF-8

nat-ptd - concentrates a set of PTD commands in a common interface

=head1 SYNOPSIS

  nat-ptd [-v] <command> [command-args]

=head1 DESCRIPTION

C<< nat-ptd >> supports the following commands. Most places where a
PTD needs to be specified, you can use a bziped2 PTD as far as the
filename ends in bz2.

=cut

our $opts;
my @args;

our %command =
  (
   query     => [\&query     => 'Allows interactive PTD querying'],
   intersect => [\&intersect => 'Intersects PTD domains, keep lower values'],
   add       => [\&add       => 'Adds/aggregates two/more PTD.'],
   compose   => [\&compose   => 'Compose PTDs into a final PTD.'],
   compare   => [\&compare   => 'Compare two PTDs with some basic stats.'],
   reprob    => [\&reprob    => 'Re-compute PTD probabilities.'],
   lowercase => [\&lowercase => 'Lowercases PTD terms summing probabilities.'],
   filter    => [\&filter    => 'Filters a PTD (or PTD pair).'],
   help      => [\&help      => 'Lists available commands.'],
   stats     => [\&stats     => 'Shows basic PTD statistics.'],
   toDmp     => [\&toDmp     => 'Converts a PTD to Dumper format.'],
   toDmpBz   => [\&toDmpBz   => 'Converts a PTD to Bzipped Dumper format.'],
   toSQLite  => [\&toSQLite  => 'Converts a PTD to SQLite format.'],
   'grep'    => [\&grep      => 'Lists entries matching a specific pattern.'],
  );




($opts, @args) = _process_args(@ARGV);

if (@args) {
    my $command_opts;
    my $command = shift @args;
    ($command_opts, @args) = _process_args(@args);

    $command = "help" unless exists($command{$command});
    $command{$command}[0]->($command_opts, @args);
} else {
    $command{help}[0]->();
}

sub _DEBUG_ {
    $opts->{v} && print STDERR "* ", @_, "\n";
}

=head2 help

The method can be invoked without arguments, and a list of available
commands will be printed.

If an optional parameter with the name of a command is supplied, it
prints detailed help for it (from this man-page).

    nat-ptd help [command-name]

=cut

sub help {
    my ($my_ops, @my_args) = @_;

    if ($my_args[0] && $command{$my_args[0]}) {
        pod2usage( -verbose => 99,
                   -sections => ["DESCRIPTION/$my_args[0]"]);
    } else {
        select STDERR;
        print "nat-ptd: performs various operations over PTD files:\n\n";
        for my $c (sort keys %command) {
            printf("%10s - %s\n", $c, $command{$c}[1]);
        }
        print "\nFor more help, please run 'perldoc nat-ptd'\n";
        select STDOUT;
    }
}

=head2 intersect

Intersects domains from supplied PTDs. Keep lowerer counts and
translation probabilities.

As of recent NATools versions, you can supply an option C<-type> to
specify the type of output file (C<dmp> or C<sqlite> are supported,
and C<dmp> is the default).

=cut

sub intersect {
    my ($my_opts, $ptd, @other) = @_;
    my @cleanup;
    if ($ptd =~ /sqlite$/) {
        cp $ptd => "$ptd._.sqlite";
        $ptd = "$ptd._.sqlite";
        push @cleanup, $ptd;
    }

    my $type    = ($my_opts->{type} && $my_opts->{type} eq "sqlite") ? "sqlite" : "dmp";
    $my_opts->{o} = "intersection.$type" if $type eq "sqlite" && !$my_opts->{o};

    ($ptd && -f $ptd) or die "At least two dictionaries are required";
    _DEBUG_ "loading first PTD [$ptd]";
    $ptd = Lingua::NATools::PTD->new($ptd);

    while (my $other = shift @other) {
        -f $other or die "Can't read PTD file [$other]";
        _DEBUG_ "loading other PTD [$other]";
        $other = Lingua::NATools::PTD->new($other);

        _DEBUG_ "intersecting PTDs";
        $ptd->intersect($other);
    }

    if ($my_opts->{o}) {
        $ptd->saveAs($type => $my_opts->{o});
    } else {
        $ptd->dump;
    }

    unlink $_ for @cleanup;
}

=head2 toSQLite

This option can be used to convert a PTD to the SQLite format. First
argument is the PTD filename. Second, optional, argument can be
specified as the output filename.

=cut

sub toSQLite {
    my ($my_opts, $ptd, $optd) = @_;
    $optd ||= $ptd;
    $optd = _get_output_filename("sqlite", $optd);

    _DEBUG_ "loading PTD [$ptd].";
    $ptd = Lingua::NATools::PTD->new($ptd);

    _DEBUG_ "writing file [$optd].";
    $ptd->saveAs("sqlite", $optd);
}


=head2 toDmp

This option can be used to convert a PTD to the Dumper format. First
argument is the PTD filename. Second, optional, argument can be
specified as the output filename.

=cut

sub _get_output_filename {
    my ($type, $name) = @_;
    return $name if $name =~ /$type$/;
    $name =~ s/\.(dmp|dmp\.bz2|dmp\.xz|sqlite)$/\.$type/;
    $name .= "\.$type" unless $name =~ /$type$/;
    return $name;
}

sub toDmp {
    my ($my_opts, $ptd, $optd) = @_;
    $optd ||= $ptd;
    $optd = _get_output_filename("dmp", $optd);

    _DEBUG_ "loading PTD [$ptd].";
    $ptd = Lingua::NATools::PTD->new($ptd);

    _DEBUG_ "writing file [$optd].";
    $ptd->saveAs("dmp", $optd);
}


=head2 toDmpBz

This option can be used to convert a PTD to a Bzipped Dumper
format. First argument is the PTD filename. Second, optional, argument
can be specified as the output filename.

=cut

sub toDmpBz {
    my ($my_opts, $ptd, $optd) = @_;
    $optd ||= $ptd;
    $optd = _get_output_filename("dmp.bz2", $optd);

    _DEBUG_ "loading PTD [$ptd].";
    $ptd = Lingua::NATools::PTD->new($ptd);

    _DEBUG_ "writing file [$optd].";
    $ptd->saveAs("bz2", $optd);
}

=head2 stats

Prints some basic statistics about a PTD.

=cut

sub stats {
    my ($my_ops, $ptd) = @_;
    _DEBUG_ "loading PTD [$ptd].";
    $ptd = Lingua::NATools::PTD->new($ptd);

    _DEBUG_ "computing statistics.";
    my $stats = $ptd->stats;

    printf "%47s -> %d\n",   "Dictionary size (types)", $stats->{count};
    printf "%47s -> %d\n",   "Dictionary size (tokens)", $stats->{size};
    printf "%47s -> %.2f\n", "Average number of translations", $stats->{avgTransNr};
    printf "%47s -> %d\n",   "Minimum number of occurrences",  $stats->{occMin};
    printf "%47s -> %s\n",   "..for word", $stats->{occMinWord};
    printf "%47s -> %d\n",   "Maximum number of occurrences",  $stats->{occMax};
    printf "%47s -> %s\n",   "..for word", $stats->{occMaxWord};
    printf "%47s -> %.2f\n", "Average number of occurrences", $stats->{avgOcc};
    printf "%47s -> %.2f\n", "Minimum value for best translation probability", $stats->{probMin};
    printf "%47s -> %.2f\n", "Maximum value for best translation probability", $stats->{probMax};
    printf "%47s -> %.2f\n", "Average best translation probability", $stats->{avgBestTrans};
}

=head2 compare

Given two PTD, print some basic statistics comparing their size, domains, etc.

=cut

sub compare {
    my ($my_ops, $ptd1, $ptd2) = @_;

    ($ptd1 && -f $ptd1) or die "Can't read PTD file $ptd1";
    ($ptd2 && -f $ptd2) or die "Can't read PTD file $ptd2";

    my ($n1, $n2) = ($ptd1, $ptd2);

    _DEBUG_ "loading PTD file [$ptd1]";
    $ptd1 = Lingua::NATools::PTD->new($ptd1);

    _DEBUG_ "loading PTD file [$ptd2]";
    $ptd2 = Lingua::NATools::PTD->new($ptd2);

    _DEBUG_ "computing basic stats for first dictionary";
    my $stats1 = $ptd1->stats;
    _DEBUG_ "computing basic stats for second dictionary";
    my $stats2 = $ptd2->stats;

    _DEBUG_ "calculating domain intersections";
    my ($left_not_in_right, $right_not_in_left) = _diff_domains($ptd1, $ptd2);

    _DEBUG_ "calculating translation sets intersections";
    my $data = _translation_sets_stats($ptd1, $ptd2);

    printf "%47s -> %8d | %8d\n",
      "Dictionary size (types)", $stats1->{count}, $stats2->{count};
    printf "%47s -> %8d | %8d\n",
      "Dictionary size (tokens)", $stats1->{size}, $stats2->{size};
    printf "%47s -> %8.2f | %8.2f\n",
      "Average number of translations", $stats1->{avgTransNr}, $stats2->{avgTransNr};
    printf "%47s -> %8d | %8d\n",
      "Minimum number of occurrences",  $stats1->{occMin}, $stats2->{occMin};
    printf "%47s -> %8s | %8s\n",
      "..for words", $stats1->{occMinWord}, $stats2->{occMinWord};
    printf "%47s -> %8d | %8d\n",
      "Maximum number of occurrences",  $stats1->{occMax}, $stats2->{occMax};
    printf "%47s -> %8s | %8s\n",
      "..for words", $stats1->{occMaxWord}, $stats2->{occMaxWord};
    printf "%47s -> %8.2f | %8.2f\n",
      "Average number of occurrences", $stats1->{avgOcc}, $stats2->{avgOcc};
    printf "%47s -> %8.2f | %8.2f\n",
      "Minimum value for best translation probability", $stats1->{probMin}, $stats2->{probMin};
    printf "%47s -> %8.2f | %8.2f\n",
      "Maximum value for best translation probability", $stats1->{probMax}, $stats2->{probMax};
    printf "%47s -> %8.2f | %8.2f\n",
      "Average best translation probability", $stats1->{avgBestTrans}, $stats2->{avgBestTrans};

    printf "%47s -> %8d | %8d\n",
      "Words not present in the other dictionary", $left_not_in_right, $right_not_in_left;

    printf "\n";

    printf "%47s -> %8d (%5.2f%%)\n",
      "Entries sharing translations and order",
        $data->{ordered}, $data->{ordered}/$stats1->{count}*100;
    printf "%47s -> %8d (%5.2f%%)\n",
      "Entries sharing translations",
        $data->{scrambled}, $data->{scrambled}/$stats1->{count}*100;
    printf "%47s -> %8d (%5.2f%%)\n",
      "Entries sharing best translation",
        $data->{shareBest}, $data->{shareBest}/$stats1->{count}*100;

    printf "%47s -> %8d\n",
      "Entries in Dic1 with trans. cont. in Dic2", $data->{firSubset};
    printf "%47s -> %8d\n",
      "Entries in Dic2 with trans. cont. in Dic1", $data->{secSubset};

}


=head2 query

This command allows you to query interactively a PTD.

=cut

sub query {
    my ($my_ops, $filename) = @_;

    ($filename && -f $filename) or die "Can't read PTD file $filename";

    _DEBUG_ "loading PTD file";
    my $ptd = Lingua::NATools::PTD->new($filename);

    _DEBUG_ "initializing readline";
    my $term = Term::ReadLine->new;
    my $prompt = "[$filename] ";
    my $attrs = $term->Attribs;
    $attrs->{completion_entry_function} = $attrs->{list_completion_function};
    $attrs->{completion_word} = [grep { /^[[:alpha:]]+(-[[:alpha:]]+)*$/ } $ptd->words];

    while (defined($_ = $term->readline($prompt))) {
        $term->addhistory($_) if /\S/;
        $_ = _trim($_);
        if ( $_ =~ /\S/) {
            if ($ptd->count($_)) {
                print "\n";
                printf "%s [%d occurrences]\n",  $_, $ptd->count($_);
                my %trans = $ptd->transHash($_);
                for my $k (sort {$trans{$b} <=> $trans{$a}} keys %trans) {
                    printf " - %15s | %7.4f%%\n", $k, $trans{$k}*100;
                }
                print "\n";
            } else {
                print "\n* word [$_] not found.\n\n";
            }
        }
    }


}

=head2 grep

Greps entries matching a specific pattern from a PTD. Supply a pattern
and a PTD file. By default it dumps a subset PTD with entries that
match. With the C<-compact> option it will print a small table with
the entry's best translation.

    nat-ptd grep [-compact] [-o=outfile] <pattern> <ptd-file>

=cut

sub grep {
    my ($my_opts, $pattern, $ptd) = @_;

    if ($my_opts->{o}) {
        open OUT, ">", $my_opts->{o} or die "Can't create file: $!";
        select OUT;
    }

    ($ptd && -f $ptd) or die "Can't find ptd: $ptd";

    _DEBUG_ "loading PTD from $ptd";
    $ptd = Lingua::NATools::PTD->new($ptd);

    # XXX - Colocar codigo em evidencia (apenas uma passagem)

    _DEBUG_ "selecting entries";
    $ptd->downtr(
                 sub {
                     my ($w, $c, %t) = @_;
                     return toentry($w, $c, %t) if $w =~ /$pattern/;
                     return undef;
                 },
                 filter => 1
                );

    _DEBUG_ "dumping PTD";
    if ($my_opts->{compact}) {
        $ptd->downtr(
                     sub {
                         my ($w,$c,%t) = @_;
                         my $x = (sort { $t{$b} <=> $t{$a} } keys %t)[0];
                         printf("%15s (%8d) %s -> %.6f\n", $w, $c, $x, $t{$x});
                     },
                     sorted => 1
                    );
    } else {
        $ptd->dump;
    }

    close OUT if $my_opts->{o};
}

=head2 compose

This method receives a two or more dictionaries.

When receiving a pair of dictionaries (first dictionary target
language should be the same as the second dictionary source language),
composes them, resulting a PTD from first dictionary source language
to second dictionary target language.

This method can be used with more than two dictionaries for a full
transitive dictionary computation.

You can specify the output filename with the C<-o> switch.


As of recent NATools versions, you can supply an option C<-type> to
specify the type of output file (C<dmp> or C<sqlite> are supported,
and C<dmp> is the default).

=cut

sub compose {
    my ($my_opts, @files) = @_;

    -f $files[0] or die "File $files[0] not found or not readable\n";

    my $type    = ($my_opts->{type} && $my_opts->{type} eq "sqlite") ? "sqlite" : "dmp";
    $my_opts->{o} = "composition.$type" if $type eq "sqlite" && !$my_opts->{o};

    _DEBUG_ "loading $files[0] PTD.";
    my $first = Lingua::NATools::PTD->new( shift @files );
    while (@files) {
        -f $files[0] or die "File $files[0] not found or not readable\n";

        _DEBUG_ "loading $files[0] PTD.";
        my $second = Lingua::NATools::PTD->new( shift @files );
        my $new = {};

        _DEBUG_ "composing...";
        for my $word ($first->words) {
            for my $trans ($first->trans($word)) {
                next unless exists $second->{$trans};

                $new->{$word}{count} = $first->count($word);
                for my $ttrans ($second->trans($trans)) {
                    $new->{$word}{trans}{$ttrans} +=
                      $first->prob($word,$trans)*$second->prob($trans,$ttrans);
                }
            }
        }
        $first = bless $new => 'Lingua::NATools::PTD';
    }

    _DEBUG_ "dumping final PTD.";
    if ($my_opts->{o}) {
        $first->saveAs($type => $my_opts->{o});
    } else {
        $first->dump;
    }
}

=head2 filter

This method filters a dictionary (or dictionary pair) accordingly with
some default values (that can be adjusted).

If the supplied name is a directory, it is supposed to be of a NATools
object (a NATools alignment folder). In this case, files C<<
source-target.dmp >> and C<< target-source.dmp >> are searched inside
it.

If the supplied name is not a directory, it is suppoed to be a name of
a PTD dump file. This command will check if it is alone (just a
direction) or if a second filename was supplied. If two were supplied,
they are considered bidirectional (source-target and target-source).

Therefore, three possible usages:

    nat-ptd filter <natools-obj-dir>
    nat-ptd filter <file.dmp>
    nat-ptd filter <file-s-t.dmp> <file-t-s.dmp>

The following switchs can be used:

=over 4

=item C<-numbers>

By default the filtering will remove terms (entries and translations)
with numbers (only numbers, with possible digit separators: space,
comma, point, colon). Use this switch to force them to be B<preserved>.

=item C<-symbols>

Any other term type that is not a standard word (with possible dash or
apostrophe) or a number (as described above), is considered to include
strange symbols, and will be ignored. Use this switch to force them to
be B<preserved>.

=item C<-none>

By default, the 'no translation', also known as 'none', is
removed. You can force it to be B<preserved> with this switch.

=item C<-occs=n>

Defines the minimum occurrence count for entries to be preserved. By
default the used value is 2 (that is, entries with 1 occurrence are
discarded). Use 0 to not discard any entry by occurrence count.

=item C<-prob=p>

Defines the minimum probability for translations to be preserved. By
default the value is 1% (0.01). Define the value as 0 to preserve all
translations.

=item C<-bidir>

Defines if the filtering should check for bidirectional translations,
that is, preserve only terms which translations' translations' include
that term. Mathematically, preserve t if

    t   in   Translations ( Translations ( t ) )

Note that this is only available for NATool objects or dictionary
pairs. By default this switch is B<ON>. Turn it B<OFF> assigning a
B<0> to the switch: C<< -bidir=0 >>

=back

Also, the C<-o> switch can be used to define an output filename. When
using a pair of dictionaries, specify the output filenames separated
by a comma: C<-o=outputfile1,outputfile2>.

As of recent NATools versions, you can supply an option C<-type> to
specify the type of output file (C<dmp> or C<sqlite> are supported,
and C<dmp> is the default).

=cut

sub filter {
    my ($my_opts, $ptd1, $ptd2) = @_;
    my @cleanup;
    my $bidir = 0;

    if ($ptd2) {
        $bidir = 1;
    } elsif (-d $ptd1) {
        ($ptd1,$ptd2) = (catfile($ptd1,'source-target.dmp'),
                         catfile($ptd1,'target-source.dmp'));
    }

    $bidir = 0 if defined $my_opts->{bidir} and $my_opts->{bidir} == 0;

    my $numbers = $my_opts->{numbers} ? 1 : 0;
    my $symbols = $my_opts->{symbols} ? 1 : 0;
    my $nones   = $my_opts->{none}    ? 1 : 0;
    my $minocc  = $my_opts->{occs}    // 2;
    my $minprob = $my_opts->{prob}    // 0.01;
    my $type    = ($my_opts->{type} && $my_opts->{type} eq "sqlite") ? "sqlite" : "dmp";

    ($ptd1  && -f $ptd1) or die "Can't find ptd: $ptd1";
    (!$ptd2 || -f $ptd2) or die "Can't find ptd: $ptd2";

    # check output filenames
    my ($out1, $out2);
    if ($my_opts->{o}) {
        if ($ptd1 and $ptd2 and $my_opts->{o} =~ /,/) {
            ($out1, $out2) = split /,/,$my_opts->{o}
        } elsif ($ptd1 and $ptd2) {
            ($out1, $out2) = ($my_opts->{o}."-st.$type",$my_opts->{o}."-ts.$type");
        } else {
            ($out1, $out2) = ($my_opts->{o}, undef)
        }
    } else {
        if ($ptd1 and $ptd2) {
            ($out1, $out2) = ($ptd1."-filtered.$type",$ptd2."-filtered.$type")
        } else {
            ($out1, $out2) = ($ptd1."-filtered.$type",undef)
        }
    }

    if ($ptd1 =~ /sqlite$/) {
        cp $ptd1 => "$ptd1._.sqlite";
        $ptd1 = "$ptd1._.sqlite";
        push @cleanup, $ptd1;
    }
    if ($ptd2 && $ptd1 =~ /sqlite$/) {
        cp $ptd2 => "$ptd2._.sqlite";
        $ptd2 = "$ptd2._.sqlite";
        push @cleanup, $ptd2;
    }

    # load PTDs
    _DEBUG_ "loading PTD from $ptd1";
    $ptd1 = Lingua::NATools::PTD->new($ptd1);
    if ($ptd2) {
        _DEBUG_ "loading PTD from $ptd2";
        $ptd2 = Lingua::NATools::PTD->new($ptd2);
    }

    my $num_re  = qr/^\d+([.,;: ]\d+)*$/;
    my $sym_re  = qr/([=|.,;:()\[\]{}!\@#"?Â»Â«\$><%^&*\\\/0-9â€“â€œâ€žÂ°Â§]|^-|-$)/;
    my $none_re = qr/^\(none\)$/;
    my $clean_entry = sub {
        my ($w, $c, %t) = @_;

        return undef if $c < $minocc;
        return undef if !$nones   and $w =~ $none_re;
        return undef if !$numbers and $w =~ $num_re;
        return undef if !$symbols and $w !~ $num_re and $w =~ $sym_re;

        for my $t (keys %t) {
            delete $t{$t} if !$nones   and $t =~ $none_re;
            delete $t{$t} if !$numbers and $t{$t} and $t =~ $num_re;
            delete $t{$t} if !$symbols and $t{$t} and $t !~ $num_re and $t =~ $sym_re;
            delete $t{$t} if $t{$t}    and $t{$t} < $minprob;
        }
        return (%t)?toentry($w, $c, %t):undef;
    };
    our $other;
    my $clear_non_transitive = sub {
        my ($w, $c, %t) = @_;
        for my $t (keys %t) {
            delete $t{$t} unless exists $other->{$t} and exists $other->{$t}{trans}{$w};
        }
        return (%t)?toentry($w, $c, %t):undef;
    };

    # filter 1
    _DEBUG_ "filtering PTD.";
    $ptd1->downtr( $clean_entry, filter => 1, verbose => $opts->{v} );
    if ($ptd2) {
        _DEBUG_ "filtering second PTD.";
        $ptd2->downtr( $clean_entry, filter => 1, verbose => $opts->{v} );
    }

    #filter 2
    if ($bidir) {
        _DEBUG_ "removing non-transitive translations.";

        $other = $ptd2;
        $ptd1->downtr( $clear_non_transitive, filter => 1, 
                                              verbose => $opts->{v});

        $other = $ptd1;
        $ptd2->downtr( $clear_non_transitive, filter => 1, 
                                              verbose => $opts->{v});
    }


    _DEBUG_ "writing ptd.";
    $ptd1->saveAs($type, $out1);

    if ($ptd2) {
        _DEBUG_ "writing second ptd.";
        $ptd2->saveAs($type, $out2);
    }

    unlink $_ for @cleanup;
}

=head2 lowercase

This method recompute the probabilities for a dictionary, lowercasing
all terms, and summing up occurrences, and recomputing probabilities.

    nat-ptd lowercase [-o=outputfile] <ptd-filename>

As of recent NATools versions, you can supply an option C<-type> to
specify the type of output file (C<dmp> or C<sqlite> are supported,
and C<dmp> is the default).

=cut

sub lowercase {
    my ($my_opts, $ptd, @my_args) = @_;
    my @cleanup;
    my $oname = $ptd;
    ($ptd && -f $ptd) or die "Can't find ptd: $ptd";
    if ($ptd =~ /sqlite$/) {
        cp $ptd => "$ptd._.sqlite";
        $ptd = "$ptd._.sqlite";
        push @cleanup, $ptd;
    }

    my $type    = ($my_opts->{type} && $my_opts->{type} eq "sqlite") ? "sqlite" : "dmp";
    $my_opts->{o} = "$oname-lowercase.$type" if $type eq "sqlite" && !$my_opts->{o};

    _DEBUG_ "loading PTD from $ptd";
    $ptd = Lingua::NATools::PTD->new($ptd);
    _DEBUG_ "recomputing dictionary";
    $ptd->lowercase(verbose => $opts->{v});

    _DEBUG_ "dumping PTD";
    if ($my_opts->{o}) {
        $ptd->saveAs($type => $my_opts->{o});
    } else {
        $ptd->dump;
    }
    unlink $_ for @cleanup;
}


=head2 reprob

This method recompute the probabilities from a dictionary. It sums up
all possible translations probabilities, consider that total to be
100% (1), and recomputes each probability accordingly.

It takes a required argument, the name of the PTD dump
file. Optionally, you can supply an output file with the C<< -o >>
switch.

    nat-ptd reprob [-o=outputfile] <ptd-filename>

As of recent NATools versions, you can supply an option C<-type> to
specify the type of output file (C<dmp> or C<sqlite> are supported,
and C<dmp> is the default).

=cut

sub reprob {
    my ($my_opts, $ptd, @my_args) = @_;
    my @cleanup;

    ($ptd && -f $ptd) or die "Can't find ptd: $ptd";
    if ($ptd =~ /sqlite$/) {
        cp $ptd => "$ptd._.sqlite";
        $ptd = "$ptd._.sqlite";
        push @cleanup, $ptd;
    }

    my $type    = ($my_opts->{type} && $my_opts->{type} eq "sqlite") ? "sqlite" : "dmp";
    $my_opts->{o} = "reprob.$type" if $type eq "sqlite" && !$my_opts->{o};

    _DEBUG_ "loading PTD from $ptd";
    $ptd = Lingua::NATools::PTD->new($ptd);
    _DEBUG_ "recomputing probabilities";
    $ptd->reprob;

    _DEBUG_ "dumping PTD";
    if ($my_opts->{o}) {
        $ptd->saveAs($type => $my_opts->{o});
    } else {
        $ptd->dump;
    }
    unlink $_ for @cleanup;
}


=head2 add

Adds two or more PTD files into a single PTD file. They should have
the same source and target language. You can use the C<-o> switch to
specify an output filename.

As of recent NATools versions, you can supply an option C<-type> to
specify the type of output file (C<dmp> or C<sqlite> are supported,
and C<dmp> is the default).

=cut

sub add {
    my ($my_opts, $fileA, $fileB, @more) = @_;

    my $type    = ($my_opts->{type} && $my_opts->{type} eq "sqlite") ? "sqlite" : "dmp";
    $my_opts->{o} = "sum.$type" if $type eq "sqlite" && !$my_opts->{o};

    my @cleanup;
    ($fileA && -f $fileA) or die "PTD $fileA not found or not readable\n";
    if ($fileA =~ /sqlite$/) {
        cp $fileA => "$fileA._.sqlite";
        $fileA = "$fileA._.sqlite";
        push @cleanup, $fileA;
    }

    ($fileB && -f $fileB) or die "PTD $fileB not found or not readable\n";

    _DEBUG_ "loading $fileA PTD.";
    $fileA = Lingua::NATools::PTD->new($fileA, verbose => $opts->{v});

    _DEBUG_ "loading $fileB PTD.";
    $fileB = Lingua::NATools::PTD->new($fileB);

    _DEBUG_ "adding PTDs.";
    $fileA->add($fileB);

    while (@more) {
        $fileB = shift @more;
        _DEBUG_ "loading $fileB PTD.";
        $fileB = Lingua::NATools::PTD->new($fileB);
        _DEBUG_ "adding PTDs.";
        $fileA->add($fileB);
    }

    _DEBUG_ "dumping final PTD.";
    if ($my_opts->{o}) {
        $fileA->saveAs($type => $my_opts->{o});
    } else {
        $fileA->dump;
    }
    unlink $_ for @cleanup;
}


sub _process_args {
    my $mopts = {};
    while (@_ && $_[0] =~ /^-/) {
        my $element = shift @_;
        if ($element =~ /^-([^=]+)=(.+)$/) {
            $mopts->{$1} = $2;
        } else {
            $element =~ /^-(.+)$/;
            $mopts->{$1} = 1;
        }
    }
    return ($mopts, @_);
}
sub _trim {
    my $f = shift;
    chomp($f);
    $f =~ s/^\s*//;
    $f =~ s/\s*$//;
    return $f;
}


sub _diff_domains {
    my ($l_dict, $r_dict) = @_;

    my ($l_count, $r_count) = (0, 0);

    $l_dict->downtr(
                    sub {
                        my ($w, $c, %t) = @_;
                        $l_count++ unless exists($r_dict->{$w});
                    } );
    $r_dict->downtr(
                    sub {
                        my ($w, $c, %t) = @_;
                        $r_count++ unless exists($l_dict->{$w});
                    } );

    return ($l_count, $r_count);
}

sub _translation_sets_stats {
    my ($dic1, $dic2) = @_;
    my $data = {};
    my @words = uniq($dic1->words, $dic2->words);
    for my $word (@words) {
        next unless exists $dic1->{$word};
        next unless exists $dic2->{$word};

        $data->{ordered}++   if _equalTranslationSeqs($dic1->{$word}, $dic2->{$word});
        $data->{scrambled}++ if _equalTranslationSets($dic1->{$word}, $dic2->{$word});
        $data->{shareBest}++ if _equalBestTranslation($dic1->{$word}, $dic2->{$word});

        $data->{secSubset}++ if _subset($dic2->{$word}, $dic1->{$word});
        $data->{firSubset}++ if _subset($dic1->{$word}, $dic2->{$word});
    }
    return $data;
}

sub _equalTranslationSeqs {
    my ($E1,$E2) = @_;

    my @T1 = sort {$E1->{trans}{$b}<=>$E1->{trans}{$a}} keys %{$E1->{trans}};
    my @T2 = sort {$E2->{trans}{$b}<=>$E2->{trans}{$a}} keys %{$E2->{trans}};

    if (scalar(@T1) == scalar(@T2)) {
        for (1..$#T1) {
            return 0 if $T1[$_] ne $T2[$_];
        }
        return 1;
    } else {
        return 0;
    }
}

sub _equalBestTranslation {
    my ($E1,$E2) = @_;

    my @T1 = sort {$E1->{trans}{$b}<=>$E1->{trans}{$a}} keys %{$E1->{trans}};
    my @T2 = sort {$E2->{trans}{$b}<=>$E2->{trans}{$a}} keys %{$E2->{trans}};

    if (($T1[0]||"") eq ($T2[0]||"")) {
        return 1;
    } else {
        return 0;
    }
}

sub _equalTranslationSets {
    my ($E1,$E2) = @_;

    my @T1 = sort keys %{$E1->{trans}};
    my @T2 = sort keys %{$E2->{trans}};

    if (scalar(@T1) == scalar(@T2)) {
        for (1..$#T1) {
            return 0 if $T1[$_] ne $T2[$_];
        }
        return 1;
    } else {
        return 0;
    }
}

sub _subset {
    my ($E1,$E2) = @_;
    my @T1 = sort keys %{$E1->{trans}};
    my @T2 = sort keys %{$E2->{trans}};

    return __subset(\@T1,\@T2);
}

sub __subset {
    my ($needle,$haystack) = @_;

    my %x = ();
    @x{@$haystack}=@$haystack;
    for (@$needle) {
        return 0 unless exists $x{$_};
    }
    return 1;
}

=head1 SEE ALSO

NATools, perl(1)

=head1 AUTHOR

Alberto Manuel BrandÃ£o SimÃµes, E<lt>ambs@cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2010-2011 by Alberto Manuel BrandÃ£o SimÃµes

=cut