#!/usr/bin/env perl
# PODNAME: split-ppred-ali.pl
# ABSTRACT: Split ALI files into subsets of sites based on ppred data

use Modern::Perl '2011';
use autodie;

use Getopt::Euclid qw(:vars);
use Smart::Comments;

use Bio::MUST::Core;
use Bio::MUST::Core::Utils qw(change_suffix secure_outfile);
use aliased 'Bio::MUST::Core::Ali';
use aliased 'Bio::MUST::Core::IdList';
use aliased 'Bio::MUST::Core::SeqMask';
use aliased 'Bio::MUST::Core::SeqMask::Profiles';


# TODO: generalize this to other executables through Utils?
my ($load, $store) = qw(load store);
my %out_args;
if ($ARGV_phylip) {
    ### Infiles (and outfiles) are in PHYLIP format
    $load  .= '_phylip';
    $store .= '_phylip';
    $out_args{chunk} = -1;
}

# setup optional lists
my $sim_list;
   $sim_list = IdList->load($ARGV_sim_seq_list) if $ARGV_sim_seq_list;
my $obs_list;
   $obs_list = IdList->load($ARGV_obs_seq_list) if $ARGV_obs_seq_list;
my $msk_list;
   $msk_list = $obs_list                        if $ARGV_only_mask;

### Processing simulated files: scalar @ARGV_sim_files
my $sim_alis = [ map { Ali->load_phylip($_) } @ARGV_sim_files ];
my $profiles = Profiles->ppred_profiles($sim_alis, { sim_list => $sim_list } );

INFILE:
for my $infile (@ARGV_infiles) {

    ### Processing: $infile
    my $ali = Ali->$load($infile);
    $ali->gapify_seqs if $ARGV_from_scafos;
    ### Infile: "$infile had " . $ali->width . ' sites'

    # optionally delete constant sites
    $ali->apply_mask( SeqMask->variable_mask($ali) ) if $ARGV_del_const;

    ### Computing masks from ppreds
    my $freqs = $profiles->ppred_freqs($ali, {
        obs_list => $obs_list,
        by_seq   => $ARGV_by_seq,
    } );

    if ($ARGV_only_dump_freqs) {
        my $sim_out = change_suffix($infile, $ARGV_out_suffix . '.sim_freqs');
        ### Dumping simulated freqs to: $sim_out
        $profiles->store($sim_out);
        my $obs_out = change_suffix($infile, $ARGV_out_suffix . '.obs_freqs');
        ### Dumping observed freqs to: $obs_out
        $freqs->store($obs_out, { reorder => $ARGV_reorder } );
        next INFILE;
    }

    my %args;
    $args{percentile} = 1 if $ARGV_percentile;
    $args{cumulative} = 1 if $ARGV_cumulative;
    $args{descending} = 1;              # freqs are always descending!

    my @masks = $freqs->bin_freqs_masks($ARGV_bin_number, \%args);

    # output one Ali per bin
    for my $i (0..$#masks) {
        my $bin_ali = $masks[$i]->filtered_ali($ali, $msk_list);
        my $outfile = secure_outfile($infile, $ARGV_out_suffix . "-bin$i");
        # Note: $ARGV_out_suffix defaults to q{} for smooth concatenation

        ### Outfile: "$outfile has " . $bin_ali->width . ' sites'
        $bin_ali->$store($outfile, \%out_args);
    }
}

__END__

=pod

=head1 NAME

split-ppred-ali.pl - Split ALI files into subsets of sites based on ppred data

=head1 VERSION

version 0.173620

=head1 SYNOPSIS

    $ split-ppred-ali.pl cpVITRELLA-80x8363.puz --phylip
        --sim-files=`ls cpNOVITRELLA-79x8363-CATGTRG-PP_sample_*.ali`
        --sim-seq-list=sim.idl --obs-seq-list=obs.idl
        --bin-number=10 --percentile --out=-ppred

    $ cat sim.idl
    Karlodiniu

    $ cat obs.idl
    Vitrella_b

    # for testing
    $ perl -Ilib bin/split-ppred-ali.pl test/for-ppred.phy --phylip
        --sim-files=`ls test/ppred-*.phy`
        --sim-seq-list=test/sim.idl --obs-seq-list=test/obs.idl
        --bin-number=10 --percentile --out=-ppred --only-mask

    $ perl -Ilib bin/split-ppred-ali.pl test/for-ppred.phy --phylip
        --sim-files=`ls test/ppred-*.phy` --by-seq --only-dump-freqs

At each site, a profile is computed from the simulated primary sequences for
ids listed in C<--sim-seq-list> and compared to the character state observed
in the sequences listed in C<--obs-seq-list>. A mask corresponding to the
simulated frequencies for the observed states is built and sites are ranked
according to these descending frequencies, which means that highest bins
include sites where the observed state is rarely (or never found) in
simulations. Sites where the state is a gap or is missing always get the
maximum frequency and thus fall in the lowest bins.

=head1 USAGE

    split-ppred-ali.pl <infiles> --simfiles=<files>... [optional arguments]

=head1 REQUIRED ARGUMENTS

=over

=item <infiles>

Path to input ALI files [repeatable argument]. If infiles are not in ALI but
in PHYLIP format, use the C<--phylip> option below.

=for Euclid: infiles.type: readable
    repeatable

=item --sim-files=<files>...

List of paths to simulated input files. These files are assumed to be in
PHYLIP format as they result from PhyloBayes' C<ppred>.

=for Euclid: files.type: readable

=back

=head1 OPTIONAL ARGUMENTS

=over

=item --out[-suffix]=<suffix>

Suffix to append to (possibly stripped) infile basenames for deriving
outfile names [default: none]. When not specified, outfile names are taken
from infiles but original infiles are preserved by being appended a .bak
suffix.

=for Euclid: suffix.type: string
    suffix.default: q{}

=item --sim-seq-list=<file>

Path to IDL file listing the ids of the sequences from which site profiles
will be computed after acquiring simulated input files [default: all seqs].

=item --obs-seq-list=<file>

Path to IDL file listing the ids of the sequences that will give observed
frequencies and thus govern site trimming in infiles [default: all seqs].

=item --by-seq

Enable seq-specific simulated site profiles [default: no]. When not specified,
average site profiles are computed from simulated input files.

=item --from-scafos

Consider the input ALI file as generated by SCaFoS [default: no]. Currently,
specifying this option results in turning all ambiguous and missing character
states to gaps.

=item --del-const

Delete constant sites just as the C<-dc> option of PhyloBayes [default: no].

=item --phylip

Assume infiles and outfiles are in PHYLIP format (instead of ALI format)
[default: no].

=item --bin-number=<n>

Number of bins to define [default: 10].

=for Euclid: n.type:    number
    n.default: 10

=item --percentile

Define bins containing an equal number of sites rather than bins of equal
width in terms of observed state frequencies [default: no].

=item --cumulative

Define bins including all previous bins [default: no]. This leads to ALI
outfiles of increasing width where the sequences listed in C<-obs-seq-list>
include ever more character states rarely observed in simulated primary
sequences for ids listed in C<--sim-seq-list>.

=item --only-mask

Mask rarely observed states in sequences listed in C<--obs-seq-list> instead
of removing the corresponding sites from the alignment [default: no].

=item --only-dump-freqs

Output simulated and observed state frequencies instead of producing regular
output files [default: no]. When specified, this option supercedes all those
pertaining to site binning.

=item --reorder

Reorder sequences following descending observed freqs [default: no]. This
option only applies when C<--only-dump-freqs> is specified.

=item --version

=item --usage

=item --help

=item --man

Print the usual program information

=back

=head1 AUTHOR

Denis BAURAIN <denis.baurain@uliege.be>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut