# $Id: Newick.pm 1660 2011-04-02 18:29:40Z rvos $
package Bio::Phylo::Parsers::Newick;
use strict;
use base 'Bio::Phylo::Parsers::Abstract';
no warnings 'recursion';

=head1 NAME

Bio::Phylo::Parsers::Newick - Parser used by Bio::Phylo::IO, no serviceable parts inside

=head1 DESCRIPTION

This module parses tree descriptions in parenthetical
format. It is called by the L<Bio::Phylo::IO> facade,
don't call it directly.

=cut
sub _return_is_scalar { 1 }

sub _parse {
    my $self   = shift;
    my $fh     = $self->_handle;
    my $forest = $self->_factory->create_forest;
    my $string;
    while (<$fh>) {
        chomp;
        $string .= $_;
    }

    # remove comments, split on tree descriptions
    for my $newick ( $self->_split($string) ) {

        # parse trees
        my $tree = $self->_parse_string($newick);

        # adding labels to untagged nodes
        if ( $self->_args->{'-label'} ) {
            my $i = 1;
            $tree->visit(
                sub {
                    my $n = shift;
                    $n->set_name( 'n' . $i++ ) unless $n->get_name;
                }
            );
        }
        $forest->insert($tree);
    }
    return $forest;
}

=begin comment

 Type    : Parser
 Title   : _split($string)
 Usage   : my @strings = $newick->_split($string);
 Function: Creates an array of (decommented) tree descriptions
 Returns : A Bio::Phylo::Forest::Tree object.
 Args    : $string = concatenated tree descriptions

=end comment

=cut

sub _split {
    my ( $self, $string ) = @_;
    my ( $QUOTED, $COMMENTED ) = ( 0, 0 );
    my $decommented = '';
    my @trees;
  TOKEN: for my $i ( 0 .. length($string) ) {
        if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq "'" ) {
            $QUOTED++;
        }
        elsif ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq "[" ) {
            $COMMENTED++;
            next TOKEN;
        }
        elsif ( !$QUOTED && $COMMENTED && substr( $string, $i, 1 ) eq "]" ) {
            $COMMENTED--;
            next TOKEN;
        }
        elsif ($QUOTED
            && !$COMMENTED
            && substr( $string, $i, 1 ) eq "'"
            && substr( $string, $i, 2 ) ne "''" )
        {
            $QUOTED--;
        }
        $decommented .= substr( $string, $i, 1 ) unless $COMMENTED;
        if ( !$QUOTED && !$COMMENTED && substr( $string, $i, 1 ) eq ';' ) {
            push @trees, $decommented;
            $decommented = '';
        }
    }
    $self->_logger->debug("removed comments, split on tree descriptions");
    return @trees;
}

=begin comment

 Type    : Parser
 Title   : _parse_string($string)
 Usage   : my $tree = $newick->_parse_string($string);
 Function: Creates a populated Bio::Phylo::Forest::Tree object from a newick
           string.
 Returns : A Bio::Phylo::Forest::Tree object.
 Args    : $string = a newick tree description

=end comment

=cut

sub _parse_string {
    my ( $self, $string ) = @_;
    my $fac = $self->_factory;
    $self->_logger->debug("going to parse tree string '$string'");
    my $tree      = $fac->create_tree;
    my $remainder = $string;
    my $token;
    my @tokens;
    while ( ( $token, $remainder ) = $self->_next_token($remainder) ) {
        last if ( !defined $token || !defined $remainder );
        $self->_logger->debug("fetched token '$token'");
        push @tokens, $token;
    }
    my $i;
    for ( $i = $#tokens ; $i >= 0 ; $i-- ) {
        last if $tokens[$i] eq ';';
    }
    my $root = $fac->create_node;
    $tree->insert($root);
    $self->_parse_node_data( $root, @tokens[ 0 .. ( $i - 1 ) ] );
    $self->_parse_clade( $tree, $root, @tokens[ 0 .. ( $i - 1 ) ] );
    return $tree;
}

sub _parse_clade {
    my ( $self, $tree, $root, @tokens ) = @_;
    my $fac = $self->_factory;
    $self->_logger->debug("recursively parsing clade '@tokens'");
    my ( @clade, $depth, @remainder );
  TOKEN: for my $i ( 0 .. $#tokens ) {
        if ( $tokens[$i] eq '(' ) {
            if ( not defined $depth ) {
                $depth = 1;
                next TOKEN;
            }
            else {
                $depth++;
            }
        }
        elsif ( $tokens[$i] eq ',' && $depth == 1 ) {
            my $node = $fac->create_node;
            $root->set_child($node);
            $tree->insert($node);
            $self->_parse_node_data( $node, @clade );
            $self->_parse_clade( $tree, $node, @clade );
            @clade = ();
            next TOKEN;
        }
        elsif ( $tokens[$i] eq ')' ) {
            $depth--;
            if ( $depth == 0 ) {
                @remainder = @tokens[ ( $i + 1 ) .. $#tokens ];
                my $node = $fac->create_node;
                $root->set_child($node);
                $tree->insert($node);
                $self->_parse_node_data( $node, @clade );
                $self->_parse_clade( $tree, $node, @clade );
                last TOKEN;
            }
        }
        push @clade, $tokens[$i];
    }
}

sub _parse_node_data {
    my ( $self, $node, @clade ) = @_;
    $self->_logger->debug("parsing name and branch length for node");
    my @tail;
  PARSE_TAIL: for ( my $i = $#clade ; $i >= 0 ; $i-- ) {
        if ( $clade[$i] eq ')' ) {
            @tail = @clade[ ( $i + 1 ) .. $#clade ];
            last PARSE_TAIL;
        }
        elsif ( $i == 0 ) {
            @tail = @clade;
        }
    }

    # name only
    if ( scalar @tail == 1 ) {
        $node->set_name( $tail[0] );
    }
    elsif ( scalar @tail == 2 ) {
        $node->set_branch_length( $tail[-1] );
    }
    elsif ( scalar @tail == 3 ) {
        $node->set_name( $tail[0] );
        $node->set_branch_length( $tail[-1] );
    }
}

sub _next_token {
    my ( $self, $string ) = @_;
    $self->_logger->debug("tokenizing string '$string'");
    my $QUOTED          = 0;
    my $token           = '';
    my $TOKEN_DELIMITER = qr/[():,;]/;
  TOKEN: for my $i ( 0 .. length($string) ) {
        $token .= substr( $string, $i, 1 );
        $self->_logger->debug("growing token: '$token'");
        if ( !$QUOTED && $token =~ $TOKEN_DELIMITER ) {
            my $length = length($token);
            if ( $length == 1 ) {
                $self->_logger->debug("single char token: '$token'");
                return $token, substr( $string, ( $i + 1 ) );
            }
            else {
                $self->_logger->debug(
                    sprintf( "range token: %s",
                        substr( $token, 0, $length - 1 ) )
                );
                return substr( $token, 0, $length - 1 ),
                  substr( $token, $length - 1, 1 )
                  . substr( $string, ( $i + 1 ) );
            }
        }
        if ( !$QUOTED && substr( $string, $i, 1 ) eq "'" ) {
            $QUOTED++;
        }
        elsif ($QUOTED
            && substr( $string, $i, 1 ) eq "'"
            && substr( $string, $i, 2 ) ne "''" )
        {
            $QUOTED--;
        }
    }
}

# podinherit_insert_token

=head1 SEE ALSO

=over

=item L<Bio::Phylo::IO>

The newick parser is called by the L<Bio::Phylo::IO> object.
Look there to learn how to parse newick strings.

=item L<Bio::Phylo::Manual>

Also see the manual: L<Bio::Phylo::Manual> and L<http://rutgervos.blogspot.com>.

=back

=head1 CITATION

If you use Bio::Phylo in published research, please cite it:

B<Rutger A Vos>, B<Jason Caravas>, B<Klaas Hartmann>, B<Mark A Jensen>
and B<Chase Miller>, 2011. Bio::Phylo - phyloinformatic analysis using Perl.
I<BMC Bioinformatics> B<12>:63.
L<http://dx.doi.org/10.1186/1471-2105-12-63>

=head1 REVISION

 $Id: Newick.pm 1660 2011-04-02 18:29:40Z rvos $

=cut
1;