—package
Bio::MUST::Core::SeqMask;
# ABSTRACT: Sequence mask for selecting specific sites
# CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be>
# CONTRIBUTOR: Raphael LEONARD <rleonard@doct.uliege.be>
$Bio::MUST::Core::SeqMask::VERSION
=
'0.250380'
;
use
Moose;
use
namespace::autoclean;
use
autodie;
# use Smart::Comments '###';
use
Carp;
use
File::Basename;
# public array
# Note: mask methods use the 0-based C/Perl array indexing
# ...but block methods encode bounds using the usual 1-based indexing
has
'mask'
=> (
traits
=> [
'Array'
],
is
=>
'ro'
,
isa
=>
'ArrayRef[Bool]'
,
default
=>
sub
{ [] },
writer
=>
'_set_mask'
,
handles
=> {
mask_len
=>
'count'
,
all_states
=>
'elements'
,
add_state
=>
'push'
,
set_state
=>
'set'
,
state_at
=>
'get'
,
},
);
sub
first_site {
my
$self
=
shift
;
return
first_index {
$_
}
$self
->all_states;
}
sub
last_site {
my
$self
=
shift
;
return
last_index {
$_
}
$self
->all_states;
}
sub
count_sites {
my
$self
=
shift
;
my
$count
=
grep
{
$_
}
$self
->all_states;
return
$count
;
}
sub
coverage {
my
$self
=
shift
;
return
$self
->count_sites /
$self
->mask_len;
}
sub
empty_mask {
my
$class
=
shift
;
my
$len
=
shift
// 0;
return
$class
->new(
mask
=> [ (0) x
$len
] );
}
sub
custom_mask {
my
$class
=
shift
;
my
$len
=
shift
;
my
$sites
=
shift
;
my
$mask
=
$class
->empty_mask(
$len
);
$mask
->set_state(
$_
, 1)
for
@{
$sites
};
return
$mask
;
}
sub
mark_block {
## no critic (RequireArgUnpacking)
return
shift
->_change_block(1,
@_
);
}
sub
unmark_block {
## no critic (RequireArgUnpacking)
return
shift
->_change_block(0,
@_
);
}
sub
_change_block {
my
$self
=
shift
;
my
$state
=
shift
;
my
$start
=
shift
;
my
$end
=
shift
;
$self
->set_state(
$_
,
$state
)
for
$start
-1 ..
$end
-1;
return
$self
;
}
sub
mask2blocks {
my
$self
=
shift
;
my
@blocks
;
my
$start
= -1;
my
$site
= 0;
while
(
$site
<
$self
->mask_len) {
# open a new block (if not yet open)
if
(
$self
->state_at(
$site
) ) {
$start
=
$site
+1
if
$start
< 0;
}
# close the current block (if any open)
else
{
if
(
$start
>= 0) {
push
@blocks
, [
$start
,
$site
];
$start
= -1;
}
}
$site
++;
}
# close last block (if still open)
push
@blocks
, [
$start
,
$site
]
if
$start
>= 0;
return
\
@blocks
;
}
sub
blocks2mask {
my
$class
=
shift
;
my
$blocks
=
shift
;
# old code: only handled non-overlapping ordered blocks
# my @mask;
# for my $block ( @{$blocks} ) {
# my ($start, $end) = @{$block};
# push @mask, (0) x ($start-@mask-1); # pad last block
# push @mask, (1) x ($end-$start+1); # mark new block
# }
# return \@mask;
# Note: blocks can now overlap and come in any order
# this allows using SeqMask for computing seq coverages
# setup empty mask going up to max end (second slot)
my
$max
= max
map
{
$_
->[1] } @{
$blocks
};
my
$mask
=
$class
->empty_mask(
$max
);
# mark all states included in each block
$mask
->mark_block( @{
$_
} )
for
@{
$blocks
};
return
$mask
;
}
# Ali-based SeqMask factory methods
sub
neutral_mask {
my
$class
=
shift
;
my
$ali
=
shift
;
my
$width
=
$ali
->width;
return
$class
->new(
mask
=> [ (1) x
$width
] );
}
sub
variable_mask {
my
$class
=
shift
;
my
$ali
=
shift
;
# TODO: profile and optimize as ideal_mask below (if needed)
my
@mask
;
my
$width
=
$ali
->width;
my
$regex
=
$ali
->gapmiss_regex;
# filter out sites with only one state
for
(
my
$site
= 0;
$site
<
$width
;
$site
++) {
my
$state_n
= uniq
# count unique states
grep
{
$_
!~ m/
$regex
/xms }
# which are valid
map
{
$_
->state_at(
$site
) }
# and found at that site
$ali
->all_seqs
# across all seqs
;
push
@mask
,
$state_n
> 1 ? 1 : 0;
}
return
$class
->new(
mask
=> \
@mask
);
}
sub
parsimony_mask {
my
$class
=
shift
;
my
$ali
=
shift
;
# TODO: profile and optimize as ideal_mask below (if needed)
my
@mask
;
my
$width
=
$ali
->width;
my
$regex
=
$ali
->gapmiss_regex;
# filter out sites with only one state
for
(
my
$site
= 0;
$site
<
$width
;
$site
++) {
my
@states
=
# get states
grep
{
$_
!~ m/
$regex
/xms }
# which are valid
map
{
$_
->state_at(
$site
) }
# and found at that site
$ali
->all_seqs;
# across all seqs
;
#### @states
# check whether at least two states are seen at least twice
my
%count_for
;
$count_for
{
$_
}++
for
@states
;
#### %count_for
my
$state_n
=
grep
{
$count_for
{
$_
} >= 2 }
keys
%count_for
;
#### $state_n
push
@mask
,
$state_n
>= 2 ? 1 : 0;
}
return
$class
->new(
mask
=> \
@mask
);
}
sub
ideal_mask {
my
$class
=
shift
;
my
$ali
=
shift
;
my
$max_res
=
shift
// 0;
# defaults to shared gaps only
# convert fractional max_res to conservative integer (if needed)
$max_res
= floor(
$max_res
*
$ali
->count_seqs)
if
0 <
$max_res
&&
$max_res
< 1;
# pick up right regex based on sequence type
my
$regex
=
$ali
->gapmiss_regex;
# determine count of non-gap-nor-missing states for each site
# original version: easy to understand but much too slow on large matrices
# because of unavoidable regex recompilation for each character state
# (lots of calls to CORE::regcomp in output Devel::NYTProf)
# my @counts;
# for my $seq ($ali->all_seqs) {
# my $site = 0;
# $counts[$site++] += ($_ !~ $regex) for $seq->all_states;
# }
# 'all magic comes with a price' version: the /o regex modifier does speed
# up things but compiles the regex once for all, which leads to bugs when
# dealing with both nucleotide and protein alignments in the same run
# my @counts;
# for my $seq ($ali->all_seqs) {
# my $site = 0;
# $counts[$site++] += ($_ !~ m/$regex/o) for $seq->all_states;
# }
# improved version: more convoluted but much faster
# idea: directly search in sequence strings
# algorithm: set maximal count for each site
# then: decrease corrresponding count for each gap-or-missing state
# my @counts = ( $ali->count_seqs ) x $ali->width;
# for my $seq ($ali->all_seqs) {
# my $str = $seq->seq;
# while ($str =~ m/$regex/xmsg) {
# $counts[ pos($str) - 1 ]--;
# }
# }
# parallel version: quite complex and inefficient
# setup threading
# my $thread_n = 2;
# my $pl = Parallel::Loops->new($thread_n);
# my %counts;
# $pl->share( \%counts );
#
# # define sites ranges
# my $site_n = $ali->width;
# my $chunk = ceil( $site_n / $thread_n );
# my @ranges;
# for (my $start = 0; $start < $site_n; $start += $chunk) {
# my $end = min( $start + $chunk, $site_n );
# push @ranges, [ $start..$end-1 ];
# }
#
# # process ranges in parallel
# my @seqs = $ali->all_seqs;
# $pl->foreach( \@ranges, sub {
# for my $seq (@seqs) {
# $counts{$_} += $seq->state_at($_) !~ $regex for @{$_};
# }
# } );
#
# # filter out sites with at most max_res non-gap-nor-missing states
# my @mask = map { $counts{$_} > $max_res ? 1 : 0 }
# sort { $a <=> $b } keys %counts; # watch the sorting cost!
# current version: even more complex but even more faster
# idea: search for (long) gaps instead of individual states
# algorithm: set maximal count for each site
# then: decrease counts for each stretch of gap-or-missing states
my
@counts
= (
$ali
->count_seqs ) x
$ali
->width;
for
my
$seq
(
$ali
->all_seqs) {
my
$str
=
$seq
->seq;
while
(
$str
=~ m/(
$regex
+)/xmsg) {
# look for next gap
my
$end
=
pos
(
$str
) - 1;
# compute gap start
my
$begin
=
$end
-
length
($1) + 1;
# and gap end
$counts
[
$_
]--
for
$begin
..
$end
;
# decrease count over the gap
}
}
# filter out sites with at most max_res non-gap-nor-missing states
my
@mask
=
map
{
$_
>
$max_res
? 1 : 0 }
@counts
;
return
$class
->new(
mask
=> \
@mask
);
}
my
%gb_parms_for
= (
strict
=> [ 0.85, 8, 10,
'N'
],
# emulate original MUST settings
medium
=> [ 0.75, 5, 5,
'H'
],
loose
=> [ 0.55, 5, 5,
'H'
],
);
sub
gblocks_mask {
my
$class
=
shift
;
my
$ali
=
shift
;
my
$mode
=
shift
//
'strict'
;
# check Gblocks settings
if
( !
defined
$gb_parms_for
{
$mode
} ) {
carp
"[BMC] Warning: invalid Gblocks settings: $mode; using strict!"
;
$mode
=
'strict'
;
}
# create temp .fasta infile for Gblocks
my
$infile
=
$ali
->temp_fasta;
# compute Gblocks parms depending on specified mode
my
(
$t
,
$b1
,
$b2
,
$b3
,
$b4
,
$b5
) = (
(
$ali
->is_protein ?
'p'
:
'd'
),
# Sequence Type
int
(
$ali
->count_seqs / 2) + 1,
# Conserved Position
int
(
$ali
->count_seqs *
$gb_parms_for
{
$mode
}[0]),
# Flank Position
@{
$gb_parms_for
{
$mode
} }[1..3],
# other block parms
);
$b2
= max(
$b1
,
$b2
);
# with few seqs 0.55*n can be smaller than n/2+1
# create Gblocks command
my
$cmd
=
"Gblocks $infile -t=$t"
.
" -b1=$b1 -b2=$b2 -b3=$b3 -b4=$b4 -b5=$b5"
.
' -s=n > /dev/null 2> /dev/null'
# minimal output
;
# try to robustly execute Gblocks
my
$ret_code
=
system
( [ 1, 127 ],
$cmd
);
# Gblocks always returns 1?!?
if
(
$ret_code
== 127) {
carp
'[BMC] Warning: cannot execute Gblocks command;'
.
' returning neutral mask!'
;
return
$class
->neutral_mask(
$ali
);
}
# TODO: try to bypass shell (need for absolute path to executable then)
# parse Gblocks output to get blocks
my
$outfile
=
$infile
.
'-gb.htm'
;
open
my
$out
,
'<'
,
$outfile
;
my
@blocks
;
while
(
my
$line
= <
$out
>) {
# only use the 'Flanks:' line
my
(
$flanks
) =
$line
=~ m/\AFlanks:\s+(.*)/xms;
if
(
defined
$flanks
) {
# isolate numbers
chomp
$flanks
;
$flanks
=~ s/[\[\]]//xmsg;
my
@bound_pairs
=
split
/\s+/xms,
$flanks
;
# take number pairs as block bounds
my
$it
= natatime 2,
@bound_pairs
;
while
(
my
@bounds
=
$it
->() ) {
push
@blocks
, \
@bounds
;
}
}
# ...and the 'New number of positions' line
# to check Gblocks didn't shrink the original Ali due to shared gaps
# which would left-shift all blocks bounds!
my
(
$width
) =
$line
=~ m/original\s+(\d+)\s+positions/xms;
if
(
defined
$width
&&
$width
!=
$ali
->width) {
carp
'[BMC] Warning: shared gaps detected; returning neutral mask!'
;
return
$class
->neutral_mask(
$ali
);
}
}
# unlink temp files
file(
$infile
)->remove;
file(
$outfile
)->remove;
# compute SeqMask from Gblocks blocks
return
$class
->blocks2mask( \
@blocks
);
}
# TODO: check interface for BMGE 2.x
my
%bmge_parms_for
= (
strict
=> [ 0.4, 0.05 ],
medium
=> [ 0.5, 0.20 ],
loose
=> [ 0.6, 0.40 ],
);
sub
bmge_mask {
my
$class
=
shift
;
my
$ali
=
shift
;
my
$mode
=
shift
//
'strict'
;
# check BMGE settings
if
( !
defined
$bmge_parms_for
{
$mode
} ) {
carp
"[BMC] Warning: invalid BMGE settings: $mode; using strict!"
;
$mode
=
'strict'
;
}
# create temp .fasta infile for BMGE
my
$infile
=
$ali
->temp_fasta;
# compute BMGE parms depending on specified mode
my
(
$entropy_cutoff
,
$gap_cutoff
) = @{
$bmge_parms_for
{
$mode
} };
my
$coding_type
=
$ali
->is_protein ?
'AA'
:
'DNA'
;
my
$outfile
=
$infile
.
'-bmge.htm'
;
# create BMGE command
my
$cmd
=
"bmge.sh $infile"
.
" $coding_type $entropy_cutoff $gap_cutoff $outfile > /dev/null"
;
# try to robustly execute BMGE
my
$ret_code
=
system
( [ 0, 127 ],
$cmd
);
if
(
$ret_code
== 127) {
carp
'[BMC] Warning: cannot execute BMGE command;'
.
' returning neutral mask!'
;
return
$class
->neutral_mask(
$ali
);
}
# TODO: try to bypass shell (need for absolute path to executable then)
# parse BMGE output to get selected sites
open
my
$out
,
'<'
,
$outfile
;
my
$len
=
$ali
->width;
my
$sites
;
LINE:
while
(
my
$line
= <
$out
>) {
if
(
$line
=~ m/\s+ selected: \s+ ([\ (0-9)]+)/xms) {
$sites
= [
map
{
$_
- 1 }
split
/\s+/xms, $1 ];
last
LINE;
}
}
# unlink temp files
file(
$infile
)->remove;
file(
$outfile
)->remove;
return
$class
->custom_mask(
$len
,
$sites
);
}
# mask manipulation methods
# TODO: improve interface? $mask1 should be $self for and_mask and or_mask
sub
and_mask {
my
$class
=
shift
;
my
$mask1
=
shift
;
my
$mask2
=
shift
;
my
@mask
;
@mask
= pairmap { (
$a
&&
$b
) // 0 } mesh ( @{
$mask1
}, @{
$mask2
} );
return
$class
->new(
mask
=> \
@mask
);
}
sub
or_mask {
my
$class
=
shift
;
my
$mask1
=
shift
;
my
$mask2
=
shift
;
my
@mask
;
@mask
= pairmap { (
$a
||
$b
) // 0 } mesh ( @{
$mask1
}, @{
$mask2
} );
return
$class
->new(
mask
=> \
@mask
);
}
sub
negative_mask {
my
$self
=
shift
;
my
$ali
=
shift
;
# negate all states over the whole Ali width
# Note the '+ 0' to ensure proper numeric context (0 or 1)
my
$width
=
$ali
->width;
my
@mask
=
map
{ ( not
$self
->state_at(
$_
) ) + 0 } 0..
$width
-1;
return
$self
->new(
mask
=> \
@mask
);
}
sub
codon_mask {
## no critic (RequireArgUnpacking)
my
$self
=
shift
;
# the critic stems from @_ below
my
$args
=
shift
// {};
# HashRef (should not be empty...)
my
$frame
=
$args
->{frame} // 1;
# defaults to frame +1
my
$chunk
=
$args
->{chunk} // 3;
# defaults to codon
my
$max
=
$args
->{max} // 1;
my
@states
=
$self
->all_states;
my
$offset
= (
abs
$frame
) - 1;
my
@mask
= ( (0) x
$offset
,
# first skip frame-1 sites
map
{ ( (
$_
>
$max
) + 0 ) x
$chunk
} bundle_by { sum
@_
}
$chunk
,
@states
[
$offset
..
$#states
] # then filter sites by chunks
);
return
$self
->new(
mask
=> [
@mask
[0..
$#states
] ] );
}
# truncate mask to original length
# SeqMask-based Ali factory methods
before
'filtered_ali'
=>
sub
{
## no critic (ProtectPrivateSubs)
return
Bio::MUST::Core::Ali::_premask_check(
@_
[1,0,2] );
# swap args
## use critic
};
sub
filtered_ali {
my
$self
=
shift
;
my
$ali
=
shift
;
my
$list
=
shift
;
# optional: enable masking vs. filtering
# setup filtering or masking
my
$char
=
$list
?
'X'
:
q{}
;
# create new Ali object (extending header comment)
# TODO: allow custom comments
my
$new_ali
= Ali->new(
comments
=> [
$ali
->all_comments,
'built by '
. (
$list
?
'masked_ali'
:
'filtered_ali'
)
],
);
SEQ:
for
my
$seq
(
$ali
->all_seqs) {
# for non-listed Seqs clone Seq to new Ali
# Note: when filtering all Seqs will be affected
if
(
$list
and not
$list
->is_listed(
$seq
->full_id) ) {
$new_ali
->add_seq(
$seq
->clone);
next
SEQ;
}
# for listed Seqs copy marked sites from original Seq...
# ... and either filter or mask others (set them to missing state)
my
$new_seq
;
my
$site
= 0;
for
my
$state
(
$self
->all_states) {
$new_seq
.=
$state
?
$seq
->state_at(
$site
) x
$state
:
$char
;
$site
++;
# x $state is for Weights objects
}
# add Seq to new Ali
$new_ali
->add_seq(
Seq->new(
seq_id
=>
$seq
->full_id,
seq
=>
$new_seq
)
);
}
return
$new_ali
;
}
# I/O methods
sub
load {
my
$class
=
shift
;
my
$infile
=
shift
;
open
my
$in
,
'<'
,
$infile
;
my
$mask
=
$class
->new();
LINE:
while
(
my
$line
= <
$in
>) {
chomp
$line
;
# skip empty lines and process comment lines
next
LINE
if
$line
=~
$EMPTY_LINE
||
$mask
->is_comment(
$line
);
$mask
->add_state(
$line
);
}
return
$mask
;
}
sub
store {
my
$self
=
shift
;
my
$outfile
=
shift
;
open
my
$out
,
'>'
,
$outfile
;
{
$out
}
$self
->header;
say
{
$out
}
join
"\n"
,
$self
->all_states;
return
;
}
# sub load_bor {
# #~ my $self = shift;
# my $class = shift;
# my $bor = shift;
# my $ali = shift;
#
# #~ my $class = 'Bio::MUST::Core::SeqMask';
#
# #~ open my $in, '<', $infile;
# my $bor_file = $bor->stringify;
# #~ tie my @rows, 'Tie::File', $bor_file;
# #~ tie my @rows, 'Tie::File', $in;
# #~ my $delims = $rows[-1];
# #~ ### $delims
# my $delims;
# open my $bof, '<', $bor_file;
# while ( my $i = <$bof> ) { $delims = $i; }
# my @blocks = pairmap { [ $a , $b ] } ( split ' ', $delims );
# #~ ### @blocks
# my $mask = $class->blocks2mask(\@blocks)->mask;
# #~ ### $mask
# my $seq_mask = $class->new( mask => $mask);
# my $negative = $seq_mask->negative_mask($ali);
# return $negative;
# }
sub
store_una {
my
$self
=
shift
;
my
$outfile
=
shift
;
my
$args
=
shift
// {};
# HashRef (should not be empty...)
# TODO: check arguments as there are no default values?
my
$ali
=
$args
->{ali};
my
$id
=
$args
->{id};
# search for non-gap positions in reference seq
my
$seq
=
$ali
->get_seq_with_id(
$id
);
my
@pos_bases
=
grep
{ !(
$seq
->is_gap(
$_
) ) } 0..(
$seq
->seq_len-1);
# delete gap positions in mask
my
@mask_degap
= @{
$self
->mask }[
@pos_bases
];
# build mask and compute blocks
my
$new_mask
=
$self
->new(
mask
=> \
@mask_degap
);
my
$new_blocks
=
$new_mask
->mask2blocks;
open
my
$out
,
'>'
,
$outfile
;
my
$filename
=
$ali
->file;
my
(
$basename
,
$dir
,
$ext
) = fileparse(
$filename
,
qr{\.[^.]*}
xms);
say
{
$out
}
"Unambigous numbering for $basename$ext based on $id"
;
for
my
$new_block
( @{
$new_blocks
} ) {
say
{
$out
}
join
'-'
, @{
$new_block
};
}
return
;
}
sub
load_blocks {
my
$class
=
shift
;
my
$infile
=
shift
;
open
my
$in
,
'<'
,
$infile
;
my
$mask
=
$class
->new();
my
@blocks
;
LINE:
while
(
my
$line
= <
$in
>) {
chomp
$line
;
# skip empty lines and process comment lines
next
LINE
if
$line
=~
$EMPTY_LINE
||
$mask
->is_comment(
$line
);
# process block specifications (one or more by line)
push
@blocks
, pairmap { [
$a
,
$b
] }
split
/\s+/xms,
$line
;
}
# build mask from blocks and replace empty mask
# this is needed to preserve potential comments
$mask
->_set_mask(
$class
->blocks2mask( \
@blocks
)->mask );
return
$mask
;
}
sub
store_blocks {
my
$self
=
shift
;
my
$outfile
=
shift
;
open
my
$out
,
'>'
,
$outfile
;
say
{
$out
}
"# Coordinates of blocks \n#"
;
# compute blocks from mask
my
$blocks
=
$self
->mask2blocks;
for
my
$block
( @{
$blocks
} ) {
say
{
$out
}
join
"\t"
, @{
$block
};
}
return
;
}
__PACKAGE__->meta->make_immutable;
1;
__END__
=pod
=head1 NAME
Bio::MUST::Core::SeqMask - Sequence mask for selecting specific sites
=head1 VERSION
version 0.250380
=head1 SYNOPSIS
# TODO
=head1 DESCRIPTION
# TODO
=head1 METHODS
=head2 first_site
=head2 last_site
=head2 count_sites
=head2 coverage
=head2 empty_mask
=head2 custom_mask
=head2 mark_block
=head2 unmark_block
=head2 mask2blocks
=head2 blocks2mask
=head2 neutral_mask
=head2 variable_mask
=head2 parsimony_mask
=head2 ideal_mask
=head2 gblocks_mask
=head2 bmge_mask
=head2 and_mask
=head2 or_mask
=head2 negative_mask
=head2 codon_mask
=head2 filtered_ali
=head2 load
=head2 store
=head2 load_bor
=head2 store_bor
=head2 load_una
=head2 store_una
=head2 load_blocks
=head2 store_blocks
=head1 AUTHOR
Denis BAURAIN <denis.baurain@uliege.be>
=head1 CONTRIBUTORS
=for stopwords Catherine COLSON Raphael LEONARD
=over 4
=item *
Catherine COLSON <ccolson@doct.uliege.be>
=item *
Raphael LEONARD <rleonard@doct.uliege.be>
=back
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut