————————————#
# BioPerl module for Bio::AlignIO::phylip
#
# Copyright Heikki Lehvaslaiho
#
=head1 NAME
Bio::AlignIO::phylip - PHYLIP format sequence input/output stream
=head1 SYNOPSIS
Do not use this module directly. Use it via the Bio::AlignIO class.
This example shows how to write to phylip format:
use Bio::AlignIO;
use Bio::SimpleAlign;
# Use -idlength to set the name length to something other than
# the default 10 if you need longer ids.
my $phylipstream = Bio::AlignIO->new(-format => 'phylip',
-fh => \*STDOUT,
-idlength => 30);
# Convert data from one format to another
my $gcgstream = Bio::AlignIO->new(-format => 'msf',
-file => 't/data/cysprot1a.msf');
while( my $aln = $gcgstream->next_aln ) {
$phylipstream->write_aln($aln);
}
# With phylip sequential format format
$phylipstream->interleaved(0);
# Or initialize the object like this
$phylipstream = Bio::AlignIO->new(-interleaved => 0,
-format => 'phylip',
-fh => \*STDOUT,
-idlength => 20 );
$gcgstream = Bio::AlignIO->new(-format => 'msf',
-file => 't/data/cysprot1a.msf');
while( my $aln = $gcgstream->next_aln ) {
$phylipstream->write_aln($aln);
}
This example shows how to read phylip format:
my $in = Bio::AlignIO->new(
-file => $inFile,
-format => 'phylip',
-interleaved => 0,
-longid => 1
);
my $out = Bio::AlignIO->new(
-file => ">$outFile",
-format => 'fasta'
);
while ( my $aln = $in->next_aln() ) {
$out->write_aln($aln);
}
The -longid argument is required if the input phylip format file
has ids with lengths greater then 10 characters.
=head1 DESCRIPTION
This object can transform Bio::SimpleAlign objects to and from PHYLIP
format. By default it works with the interleaved format. By specifying
the flag -interleaved =E<gt> 0 in the initialization the module can
read or write data in sequential format.
Reading phylip format with long IDs up to 50 characters is supported by
the flag -longid =E<gt>1. ID strings can be surrounded by single quotes.
They are mandatory only if the IDs contain spaces.
=head1 FEEDBACK
=head2 Support
Please direct usage questions or support issues to the mailing list:
I<bioperl-l@bioperl.org>
rather than to the module maintainer directly. Many experienced and
reponsive experts will be able look at the problem and quickly
address it. Please include a thorough description of the problem
with code and data examples if at all possible.
=head2 Reporting Bugs
Report bugs to the Bioperl bug tracking system to help us keep track
the bugs and their resolution. Bug reports can be submitted via the
web:
=head1 AUTHORS - Heikki Lehvaslaiho and Jason Stajich
Email: heikki at ebi.ac.uk
Email: jason at bioperl.org
=head1 APPENDIX
The rest of the documentation details each of the object
methods. Internal methods are usually preceded with a _
=cut
# Let the code begin...
package
Bio::AlignIO::phylip;
$Bio::AlignIO::phylip::VERSION
=
'1.7.8'
;
use
strict;
use
Bio::SimpleAlign;
BEGIN {
$DEFAULTIDLENGTH
= 10;
$DEFAULTLINELEN
= 60;
$DEFAULTTAGLEN
= 10;
}
=head2 new
Title : new
Usage : my $alignio = Bio::AlignIO->new(-format => 'phylip'
-file => '>file',
-idlength => 10,
-idlinebreak => 1);
Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer
Returns : L<Bio::AlignIO> object
Args : [specific for writing of phylip format files]
-idlength => integer - length of the id (will pad w/
spaces if needed) when writing phylip
-interleaved => boolean - whether interleaved
or sequential format required
-line_length => integer of how long a sequence lines should be
-idlinebreak => insert a line break after the sequence id
so that sequence starts on the next line
-flag_SI => whether or not write a "S" or "I" just after
the num.seq. and line len., in the first line
-tag_length => integer of how long the tags have to be in
each line between the space separator. set it
to 0 to have 1 tag only.
-wrap_sequential => boolean for whether or not sequential
format should be broken up or a single line
default is false (single line)
-longid => boolean to read arbitrary long IDs (default is false)
=cut
sub
_initialize {
my
(
$self
,
@args
) =
@_
;
$self
->SUPER::_initialize(
@args
);
my
(
$interleave
,
$linelen
,
$idlinebreak
,
$idlength
,
$flag_SI
,
$tag_length
,
$ws
,
$longid
)
=
$self
->_rearrange(
[
qw(INTERLEAVED
LINE_LENGTH
IDLINEBREAK
IDLENGTH
FLAG_SI
TAG_LENGTH
WRAP_SEQUENTIAL
LONGID)
],
@args
);
$self
->interleaved(
$interleave
? 1 : 0 )
if
defined
$interleave
;
$self
->idlength(
$idlength
||
$DEFAULTIDLENGTH
);
$self
->id_linebreak(1)
if
(
$idlinebreak
);
$self
->line_length(
$linelen
)
if
defined
$linelen
&&
$linelen
> 0;
$self
->flag_SI(1)
if
(
$flag_SI
);
$self
->tag_length(
$tag_length
)
if
(
$tag_length
||
$DEFAULTTAGLEN
);
$self
->wrap_sequential(
$ws
? 1 : 0 );
$self
->longid(
$longid
? 1 : 0 );
1;
}
=head2 next_aln
Title : next_aln
Usage : $aln = $stream->next_aln()
Function: returns the next alignment in the stream.
Throws an exception if trying to read in PHYLIP
sequential format.
Returns : L<Bio::SimpleAlign> object
Args :
=cut
sub
next_aln {
my
$self
=
shift
;
my
$entry
;
my
(
$seqcount
,
$residuecount
,
%hash
,
$name
,
$str
,
@names
,
$seqname
,
$start
,
$end
,
$count
,
$seq
);
my
$aln
= Bio::SimpleAlign->new(
-source
=>
'phylip'
);
# First, parse up through the header.
# If we see a non-blank line that isn't the seqcount and residuecount line
# then bail out of next_aln (return)
while
(
$entry
=
$self
->_readline ) {
if
(
$entry
=~ /^\s?$/ ) {
next
;
}
elsif
(
$entry
=~ /\s*(\d+)\s+(\d+)/ ) {
(
$seqcount
,
$residuecount
) = ( $1, $2 );
last
;
}
else
{
$self
->
warn
(
"Failed to parse PHYLIP: Did not see a sequence count and residue count."
);
return
;
}
}
return
unless
$seqcount
and
$residuecount
;
# First alignment section. We expect to see a name and (part of) a sequence.
my
$idlen
=
$self
->idlength;
$count
= 0;
while
(
$entry
=
$self
->_readline ) {
if
(
$entry
=~ /^\s?$/ ) {
# eat the newlines
next
;
}
# Names can be in a few different formats:
# 1. they can be traditional phylip: 10 chars long, period. If this is the case, that name can have spaces.
# 2. they can be hacked with a long ID, as passed in with the flag -longid.
# 3. if there is a long ID, the name can have spaces as long as it is wrapped in single quotes.
if
(
$self
->longid() ) {
# 2 or 3
if
(
$entry
=~ /^
'(.+)'
\s+(.+)$/ ) {
# 3. name has single quotes.
$name
= $1;
$str
= $2;
}
else
{
# 2. name does not have single quotes, so should not have spaces.
# therefore, the first part of the line is the name and the rest is the seq.
# make sure that the line does not lead with extra spaces.
$entry
=~ s/^\s+//;
(
$name
,
$str
) =
split
( /\s+/,
$entry
, 2 );
}
}
else
{
# 1. traditional phylip.
$entry
=~ /^(.{1,10})\s(.+)$/;
$name
= $1;
$str
= $2;
$name
=~ s/\s+$//;
# eat any trailing spaces
$name
=~ s/\s+/_/g;
}
push
@names
,
$name
;
#clean sequence of spaces:
$str
=~ s/\s+//g;
# are we sequential? If so, we should keep adding to the sequence until we've got all the residues.
if
( (
$self
->interleaved ) == 0 ) {
while
(
length
(
$str
) <
$residuecount
) {
$entry
=
$self
->_readline;
$str
.=
$entry
;
$str
=~ s/\s+//g;
if
(
$entry
=~ /^\s*$/ ) {
# we ran into a newline before we got a complete sequence: bail!
$self
->
warn
(
"Failed to parse PHYLIP: Sequence $name was shorter than expected: "
.
length
(
$str
)
.
" instead of $residuecount."
);
last
;
}
}
}
$hash
{
$count
} =
$str
;
$count
++;
# if we've read as many seqs as we're supposed to, move on.
if
(
$count
==
$seqcount
) {
last
;
}
}
# if we are interleaved, we're going to keep seeing chunks of sequence until we get all of it.
if
(
$self
->interleaved ) {
while
(
length
(
$hash
{
$seqcount
- 1 } ) <
$residuecount
) {
$count
= 0;
while
(
$entry
=
$self
->_readline ) {
if
(
$entry
=~ /^\s*$/ ) {
# eat newlines
if
(
$count
!= 0 ) {
# there was a newline at an unexpected place!
$self
->
warn
(
"Failed to parse PHYLIP: Interleaved file is missing a segment: saw $count, expected $seqcount."
);
return
;
}
next
;
}
else
{
# start taking in chunks
$entry
=~ s/\s//g;
$hash
{
$count
} .=
$entry
;
$count
++;
}
if
(
$count
>=
$seqcount
) {
# we've read all of the sequences for this chunk, so move on.
last
;
}
}
}
}
if
( (
scalar
@names
) !=
$seqcount
) {
$self
->
warn
(
"Failed to parse PHYLIP: Did not see the correct number of seqs: saw "
.
scalar
(
@names
)
.
", expected $seqcount."
);
return
;
}
for
(
$count
= 0;
$count
<
$seqcount
;
$count
++ ) {
$str
=
$hash
{
$count
};
my
$seqname
=
$names
[
$count
];
if
(
length
(
$str
) !=
$residuecount
) {
$self
->
warn
(
"Failed to parse PHYLIP: Sequence $seqname was the wrong length: "
.
length
(
$str
)
.
" instead of $residuecount."
);
}
$seq
= Bio::LocatableSeq->new(
'-seq'
=>
$hash
{
$count
},
'-display_id'
=>
$seqname
);
$aln
->add_seq(
$seq
);
}
return
$aln
;
}
=head2 write_aln
Title : write_aln
Usage : $stream->write_aln(@aln)
Function: writes the $aln object into the stream in phylip format
Returns : 1 for success and 0 for error
Args : L<Bio::Align::AlignI> object
=cut
sub
write_aln {
my
(
$self
,
@aln
) =
@_
;
my
$count
= 0;
my
$wrapped
= 0;
my
$maxname
;
my
$width
=
$self
->line_length();
my
(
$length
,
$date
,
$name
,
$seq
,
$miss
,
$pad
,
%hash
,
@arr
,
$tempcount
,
$index
,
$idlength
,
$flag_SI
,
$line_length
,
$tag_length
);
foreach
my
$aln
(
@aln
) {
if
( !
$aln
|| !
$aln
->isa(
'Bio::Align::AlignI'
) ) {
$self
->
warn
(
"Must provide a Bio::Align::AlignI object when calling write_aln"
);
next
;
}
$self
->throw(
"All sequences in the alignment must be the same length"
)
unless
$aln
->is_flush(1);
$flag_SI
=
$self
->flag_SI();
$aln
->set_displayname_flat();
# plain
$length
=
$aln
->
length
();
if
(
$flag_SI
) {
if
(
$self
->interleaved() ) {
$self
->_print(
sprintf
(
" %s %s I\n"
,
$aln
->num_sequences,
$aln
->
length
)
);
}
else
{
$self
->_print(
sprintf
(
" %s %s S\n"
,
$aln
->num_sequences,
$aln
->
length
)
);
}
}
else
{
$self
->_print(
sprintf
(
" %s %s\n"
,
$aln
->num_sequences,
$aln
->
length
) );
}
$idlength
=
$self
->idlength();
$line_length
=
$self
->line_length();
$tag_length
=
$self
->tag_length();
foreach
$seq
(
$aln
->each_seq() ) {
$name
=
$aln
->displayname(
$seq
->get_nse );
if
(
$self
->longid ) {
$self
->
warn
(
"The length of the name is over 50 chars long [$name]"
)
if
length
(
$name
) > 50;
$name
=
"'$name' "
;
}
else
{
$name
=
substr
(
$name
, 0,
$idlength
)
if
length
(
$name
) >
$idlength
;
$name
=
sprintf
(
"%-"
.
$idlength
.
"s"
,
$name
);
if
(
$self
->interleaved() ) {
$name
.=
' '
;
}
elsif
(
$self
->id_linebreak ) {
$name
.=
"\n"
;
}
}
#phylip needs dashes not dots
my
$seq
=
$seq
->seq();
$seq
=~ s/\./-/g;
$hash
{
$name
} =
$seq
;
push
(
@arr
,
$name
);
}
if
(
$self
->interleaved() ) {
my
$numtags
;
if
(
$tag_length
<=
$line_length
) {
$numtags
= floor(
$line_length
/
$tag_length
);
$line_length
=
$tag_length
*
$numtags
;
}
else
{
$numtags
= 1;
}
while
(
$count
<
$length
) {
# there is another block to go!
foreach
$name
(
@arr
) {
my
$dispname
=
$name
;
$dispname
=
''
if
$wrapped
;
$self
->_print(
sprintf
(
"%"
. (
$idlength
+ 3 ) .
"s"
,
$dispname
) );
$tempcount
=
$count
;
$index
= 0;
$self
->debug(
"residue count: $count\n"
)
if
(
$count
% 100000 == 0 );
while
((
$tempcount
+
$tag_length
<
$length
)
&& (
$index
<
$numtags
) ) {
$self
->_print(
sprintf
(
"%s "
,
substr
(
$hash
{
$name
},
$tempcount
,
$tag_length
)
)
);
$tempcount
+=
$tag_length
;
$index
++;
}
# last
if
(
$index
<
$numtags
) {
# space to print!
$self
->_print(
sprintf
(
"%s"
,
substr
(
$hash
{
$name
},
$tempcount
) )
);
$tempcount
+=
$tag_length
;
}
$self
->_print(
"\n"
);
}
$self
->_print(
"\n"
);
$count
=
$tempcount
;
$wrapped
= 1;
}
}
else
{
foreach
$name
(
@arr
) {
my
$dispname
=
$name
;
my
$line
=
sprintf
(
"%s%s\n"
,
$dispname
,
$hash
{
$name
} );
if
(
$self
->wrap_sequential ) {
$line
=~ s/(.{1,
$width
})/$1\n/g;
}
$self
->_print(
$line
);
}
}
}
$self
->flush
if
$self
->_flush_on_write &&
defined
$self
->_fh;
return
1;
}
=head2 interleaved
Title : interleaved
Usage : my $interleaved = $obj->interleaved
Function: Get/Set Interleaved status
Returns : boolean
Args : boolean
=cut
sub
interleaved {
my
(
$self
,
$value
) =
@_
;
if
(
defined
$value
) {
if
(
$value
) {
$self
->{
'_interleaved'
} = 1
}
else
{
$self
->{
'_interleaved'
} = 0
}
}
return
1
unless
defined
$self
->{
'_interleaved'
};
return
$self
->{
'_interleaved'
};
}
=head2 flag_SI
Title : flag_SI
Usage : my $flag = $obj->flag_SI
Function: Get/Set if the Sequential/Interleaved flag has to be shown
after the number of sequences and sequence length
Example :
Returns : boolean
Args : boolean
=cut
sub
flag_SI {
my
(
$self
,
$value
) =
@_
;
my
$previous
=
$self
->{
'_flag_SI'
};
if
(
defined
$value
) {
$self
->{
'_flag_SI'
} =
$value
;
}
return
$previous
;
}
=head2 idlength
Title : idlength
Usage : my $idlength = $obj->idlength
Function: Get/Set value of id length
Returns : string
Args : string
=cut
sub
idlength {
my
(
$self
,
$value
) =
@_
;
if
(
defined
$value
) {
$self
->{
'_idlength'
} =
$value
;
}
return
$self
->{
'_idlength'
};
}
=head2 line_length
Title : line_length
Usage : $obj->line_length($newval)
Function:
Returns : value of line_length
Args : newvalue (optional)
=cut
sub
line_length {
my
(
$self
,
$value
) =
@_
;
if
(
defined
$value
) {
$self
->{
'_line_length'
} =
$value
;
}
return
$self
->{
'_line_length'
} ||
$DEFAULTLINELEN
;
}
=head2 tag_length
Title : tag_length
Usage : $obj->tag_length($newval)
Function:
Example : my $tag_length = $obj->tag_length
Returns : value of the length for each space-separated tag in a line
Args : newvalue (optional) - set to zero to have one tag per line
=cut
sub
tag_length {
my
(
$self
,
$value
) =
@_
;
if
(
defined
$value
) {
$self
->{
'_tag_length'
} =
$value
;
}
return
$self
->{
'_tag_length'
} ||
$DEFAULTTAGLEN
;
}
=head2 id_linebreak
Title : id_linebreak
Usage : $obj->id_linebreak($newval)
Function:
Returns : value of id_linebreak
Args : newvalue (optional)
=cut
sub
id_linebreak {
my
(
$self
,
$value
) =
@_
;
if
(
defined
$value
) {
$self
->{
'_id_linebreak'
} =
$value
;
}
return
$self
->{
'_id_linebreak'
} || 0;
}
=head2 wrap_sequential
Title : wrap_sequential
Usage : $obj->wrap_sequential($newval)
Function:
Returns : value of wrap_sequential
Args : newvalue (optional)
=cut
sub
wrap_sequential {
my
(
$self
,
$value
) =
@_
;
if
(
defined
$value
) {
$self
->{
'_wrap_sequential'
} =
$value
;
}
return
$self
->{
'_wrap_sequential'
} || 0;
}
=head2 longid
Title : longid
Usage : $obj->longid($newval)
Function:
Returns : value of longid
Args : newvalue (optional)
=cut
sub
longid {
my
(
$self
,
$value
) =
@_
;
if
(
defined
$value
) {
$self
->{
'_longid'
} =
$value
;
}
return
$self
->{
'_longid'
} || 0;
}
1;