#!/usr/bin/env perl
# PODNAME: split-rates-ali.pl
# ABSTRACT: Split ALI files into subsets of sites based on evolutionary rates

use Modern::Perl '2011';
use autodie;

use Getopt::Euclid qw(:vars);
use Smart::Comments;

use Bio::MUST::Core;
use Bio::MUST::Core::Utils qw(change_suffix secure_outfile);
use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::SeqMask';
use aliased 'Bio::MUST::Core::SeqMask::Rates';


# TODO: generalize this to other executables through Utils?
my ($load, $store) = qw(load store);
my $suffix = '.ali';
my %out_args;
if ($ARGV_phylip) {
    ### Infiles (and outfiles) are in PHYLIP format
    $load  .= '_phylip';
    $store .= '_phylip';
    $suffix = '.phy';
    $out_args{chunk} = -1;
}

# optionally load other rates file for deltas
my $other_rates;
   $other_rates = Rates->load($ARGV_other_rates) if $ARGV_other_rates;

for my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $rates = Rates->load($infile);

    $infile =~ s/$_//xms for @ARGV_in_strip;
    my $alifile = change_suffix($infile, $suffix);
    my $ali = Ali->$load($alifile);
    $ali->gapify_seqs if $ARGV_from_scafos;
    ### Infile: "$alifile had " . $ali->width . ' sites'

    # optionally delete constant sites
    $ali->apply_mask( SeqMask->variable_mask($ali) ) if $ARGV_del_const;

    # optionally use delta rates instead of raw rates if available
    $rates = $rates->delta_rates($other_rates) if $other_rates;

    ### Computing masks from rates
    my %args;
    $args{percentile} = 1 if $ARGV_percentile;
    $args{cumulative} = 1 if $ARGV_cumulative;
    $args{descending} = 1 if $ARGV_descending;
    my @masks = $rates->bin_rates_masks($ARGV_bin_number, \%args);

    # output one Ali per bin
    for my $i (0..$#masks) {
        my $bin_ali = $masks[$i]->filtered_ali($ali);
        my $outfile = secure_outfile($alifile, $ARGV_out_suffix . "-bin$i");
        # Note: $ARGV_out_suffix defaults to q{} for smooth concatenation

        ### Outfile: "$outfile has " . $bin_ali->width . ' sites'
        $bin_ali->$store($outfile, \%out_args);
    }
}

__END__

=pod

=head1 NAME

split-rates-ali.pl - Split ALI files into subsets of sites based on evolutionary rates

=head1 VERSION

version 0.200510

=head1 USAGE

    split-rates-ali.pl <infiles> [optional arguments]

=head1 REQUIRED ARGUMENTS

=over

=item <infiles>

Path to input RATES files [repeatable argument].

=for Euclid: infiles.type: readable
    repeatable

=back

=head1 OPTIONAL ARGUMENTS

=over

=item --in[-strip]=<str>

Substring(s) to strip from infile basenames before attempting to derive other
infile (e.g., ALI files) and outfile names [default: none].

=for Euclid: str.type: string
    repeatable

=item --out[-suffix]=<suffix>

Suffix to append to (possibly stripped) infile basenames for deriving
outfile names [default: none]. When not specified, outfile names are taken
from infiles but original infiles are preserved by being appended a .bak
suffix.

=for Euclid: suffix.type: string
    suffix.default: q{}

=item --from-scafos

Consider the input ALI file as generated by SCaFoS [default: no]. Currently,
specifying this option results in turning all ambiguous and missing character
states to gaps.

=item --del-const

Delete constant sites just as the C<-dc> option of PhyloBayes [default: no].

=item --phylip

Assume infiles and outfiles are in PHYLIP format (instead of ALI format)
[default: no].

=item --other-rates=<file>

Optional additional rates file of the same length as the main infile(s) that
will be used to compute rate deltas [default: none]. Currently, rate deltas
are defined as the absolute difference between the two rates at each site.
This could be improved by, e.g., computing relative deltas.

=for Euclid: file.type: readable

=item --bin-number=<n>

Number of bins to define [default: 10].

=for Euclid: n.type:    number
    n.default: 10

=item --percentile

Define bins containing an equal number of sites rather than bins of equal width
in terms of rates [default: no].

=item --cumulative

Define bins including all previous bins [default: no]. This leads to ALI
outfiles of increasing width and only makes sense when slower sites are in
lower bins. If higher "rates" mean slower sites, use the --descending option.

=item --descending

Reverse bin order to accommodate "rates" files where higher values actually
correspond to slower sites, such as those produced by Carla Cummins' TIGER
[default: no].

=item --version

=item --usage

=item --help

=item --man

Print the usual program information

=back

=head1 AUTHOR

Denis BAURAIN <denis.baurain@uliege.be>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut