#!/usr/bin/env perl # PODNAME: split-rates-ali.pl # ABSTRACT: Split ALI files into subsets of sites based on evolutionary rates use Modern::Perl '2011'; use autodie; use Getopt::Euclid qw(:vars); use Smart::Comments; use Bio::MUST::Core; use Bio::MUST::Core::Utils qw(change_suffix secure_outfile); use aliased 'Bio::MUST::Core::Ali'; use aliased 'Bio::MUST::Core::SeqMask'; use aliased 'Bio::MUST::Core::SeqMask::Rates'; # TODO: generalize this to other executables through Utils? my ($load, $store) = qw(load store); my $suffix = '.ali'; my %out_args; if ($ARGV_phylip) { ### Infiles (and outfiles) are in PHYLIP format $load .= '_phylip'; $store .= '_phylip'; $suffix = '.phy'; $out_args{chunk} = -1; } # optionally load other rates file for deltas my $other_rates; $other_rates = Rates->load($ARGV_other_rates) if $ARGV_other_rates; for my $infile (@ARGV_infiles) { ### Processing: $infile my $rates = Rates->load($infile); $infile =~ s/$_//xms for @ARGV_in_strip; my $alifile = change_suffix($infile, $suffix); my $ali = Ali->$load($alifile); $ali->gapify_seqs if $ARGV_from_scafos; ### Infile: "$alifile had " . $ali->width . ' sites' # optionally delete constant sites $ali->apply_mask( SeqMask->variable_mask($ali) ) if $ARGV_del_const; # optionally use delta rates instead of raw rates if available $rates = $rates->delta_rates($other_rates) if $other_rates; ### Computing masks from rates my %args; $args{percentile} = 1 if $ARGV_percentile; $args{cumulative} = 1 if $ARGV_cumulative; $args{descending} = 1 if $ARGV_descending; my @masks = $rates->bin_rates_masks($ARGV_bin_number, \%args); # output one Ali per bin for my $i (0..$#masks) { my $bin_ali = $masks[$i]->filtered_ali($ali); my $outfile = secure_outfile($alifile, $ARGV_out_suffix . "-bin$i"); # Note: $ARGV_out_suffix defaults to q{} for smooth concatenation ### Outfile: "$outfile has " . $bin_ali->width . ' sites' $bin_ali->$store($outfile, \%out_args); } } __END__ =pod =head1 NAME split-rates-ali.pl - Split ALI files into subsets of sites based on evolutionary rates =head1 VERSION version 0.200510 =head1 USAGE split-rates-ali.pl <infiles> [optional arguments] =head1 REQUIRED ARGUMENTS =over =item <infiles> Path to input RATES files [repeatable argument]. =for Euclid: infiles.type: readable repeatable =back =head1 OPTIONAL ARGUMENTS =over =item --in[-strip]=<str> Substring(s) to strip from infile basenames before attempting to derive other infile (e.g., ALI files) and outfile names [default: none]. =for Euclid: str.type: string repeatable =item --out[-suffix]=<suffix> Suffix to append to (possibly stripped) infile basenames for deriving outfile names [default: none]. When not specified, outfile names are taken from infiles but original infiles are preserved by being appended a .bak suffix. =for Euclid: suffix.type: string suffix.default: q{} =item --from-scafos Consider the input ALI file as generated by SCaFoS [default: no]. Currently, specifying this option results in turning all ambiguous and missing character states to gaps. =item --del-const Delete constant sites just as the C<-dc> option of PhyloBayes [default: no]. =item --phylip Assume infiles and outfiles are in PHYLIP format (instead of ALI format) [default: no]. =item --other-rates=<file> Optional additional rates file of the same length as the main infile(s) that will be used to compute rate deltas [default: none]. Currently, rate deltas are defined as the absolute difference between the two rates at each site. This could be improved by, e.g., computing relative deltas. =for Euclid: file.type: readable =item --bin-number=<n> Number of bins to define [default: 10]. =for Euclid: n.type: number n.default: 10 =item --percentile Define bins containing an equal number of sites rather than bins of equal width in terms of rates [default: no]. =item --cumulative Define bins including all previous bins [default: no]. This leads to ALI outfiles of increasing width and only makes sense when slower sites are in lower bins. If higher "rates" mean slower sites, use the --descending option. =item --descending Reverse bin order to accommodate "rates" files where higher values actually correspond to slower sites, such as those produced by Carla Cummins' TIGER [default: no]. =item --version =item --usage =item --help =item --man Print the usual program information =back =head1 AUTHOR Denis BAURAIN <denis.baurain@uliege.be> =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut