package Bio::MUST::Core::GeneticCode::Factory;
# ABSTRACT: Genetic code factory based on NCBI gc.prt file

use Moose;
use namespace::autoclean;

# AUTOGENERATED CODE! DO NOT MODIFY THIS FILE!

use autodie;
use feature qw(say);

use Carp;
use File::Spec;
use LWP::Simple qw(get);
use Path::Class qw(file);
use Try::Tiny;

use Bio::MUST::Core::Types;
use aliased 'Bio::MUST::Core::GeneticCode';


# public path to NCBI Taxonomy dump directory
has 'tax_dir' => (
    # read-only, optional (no 'required' flag, no default)
    is       => 'ro',
    # values of other types are coerced into the project's Dir type
    coerce   => 1,
    isa      => 'Bio::MUST::Core::Types::Dir',
    # NOTE(review): in the code visible here, only the commented-out legacy
    # gc.prt reader uses this attribute -- confirm it is still needed
);


# private hash hosting NCBI codes
has '_code_for' => (
    is       => 'ro',
    isa      => 'HashRef[Bio::MUST::Core::GeneticCode]',
    traits   => ['Hash'],

    # built on first access by parsing the gc.prt content;
    # cannot be supplied through the constructor
    lazy     => 1,
    builder  => '_build_code_for',
    init_arg => undef,

    # public delegations to the native hash trait
    handles  => {
        list_codes => 'keys',       # all keys (ids and names) of the hash
        code_for   => 'get',        # GeneticCode stored under a given key
    },
);


## no critic (ProhibitUnusedPrivateSubroutines)

# Builder for the private '_code_for' attribute.
#
# Parses the text returned by _get_gcprt_content (NCBI gc.prt format) and
# returns a hash reference in which each genetic code is stored as a
# GeneticCode object under its numeric id and under each of its names.
#
# Croaks when no code block can be found at all, when a block lacks its id,
# its amino acid line or one of its three base lines, or when a base line
# and the amino acid line differ in length.
sub _build_code_for {
    my $self = shift;

    # split file content into code blocks
    # (braces are excluded from the capture, so only innermost pairs match)
    my @codes = $self->_get_gcprt_content =~ m/ \{ ( [^{}]+ ) \} /xmsgc;
    croak "Error: cannot parse 'gc.prt' file; aborting!" unless @codes;

# Genetic-code-table ::= {
# ...
#  {
#     name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate
#  Mitochondrial; Mycoplasma; Spiroplasma" ,
#   name "SGC3" ,
#   id 4 ,
#   ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#   sncbieaa "--MM---------------M------------MMMM---------------M------------"
#   -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#   -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#   -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
#  },
#  ...
# }
    my %code_for;

    for my $code (@codes) {

        # get all names and id for current code
        # \b keeps 'id' and 'name' from matching at the end of a longer word
        my ($id)      = $code =~ m/ \b id      \s*   (\d+)   /xms;
        my @names     = $code =~ m/ \b name    \s* \"(.*?)\" /xmsg;
        croak "Error: cannot parse id of a 'gc.prt' code block; aborting!"
            unless defined $id;

        @names = map {       s{\n}{}xmsgr } @names;     # remove newline chars
        @names = map { split m{;\s*}xms }   @names;     # demultiplex names

        # retrieve the amino acid line
        # \b ensures that the 'sncbieaa' (start codons) line cannot match
        my ($aa_line) = $code =~ m/ \b ncbieaa \s* \"(.*?)\" /xms;

        # retrieve the three codon lines
        my ($b1_line) = $code =~ m/ \b Base1   \s* ([TACG]+) /xms;
        my ($b2_line) = $code =~ m/ \b Base2   \s* ([TACG]+) /xms;
        my ($b3_line) = $code =~ m/ \b Base3   \s* ([TACG]+) /xms;

        # fail early (and clearly) on incomplete or inconsistent blocks
        # instead of building a truncated translation table
        my %line_for = (
            ncbieaa => $aa_line,
            Base1   => $b1_line,
            Base2   => $b2_line,
            Base3   => $b3_line,
        );
        for my $field ( qw(ncbieaa Base1 Base2 Base3) ) {
            my $line = $line_for{$field};
            croak "Error: cannot parse $field line of 'gc.prt' code $id;"
                . ' aborting!' unless defined $line;
            croak "Error: $field and ncbieaa lines of 'gc.prt' code $id"
                . ' differ in length; aborting!'
                unless length($line) == length($aa_line);
        }

        $aa_line =~ tr/*/x/;                    # make STOPs MUST-compliant

        # split lines into aas and bases
        my @aas    = split //, $aa_line;
        my @bases1 = split //, $b1_line;
        my @bases2 = split //, $b2_line;
        my @bases3 = split //, $b3_line;

        # build translation table for current code
        # (column i of the three base lines spells the codon for aa i)
        my %aa_for = map {
            join( '', $bases1[$_], $bases2[$_], $bases3[$_] ) => $aas[$_]
        } 0..$#aas;

        # add gap 'codons' to code
        $aa_for{'***'} = q{*};
        $aa_for{'---'} = q{*};
        $aa_for{'   '} = q{ };

        # store translation table under its various id and names
        # (one GeneticCode object per key, all sharing the same table)
        $code_for{$_} = GeneticCode->new(
            ncbi_id => $id,
            _code   => \%aa_for
        ) for ($id, @names);
    }

    return \%code_for;
}

## use critic

# old version using a local or remote copy of NCBI gc.prt file
# sub _get_gcprt_content {
#     my $self = shift;
#
#     my $content;
#
#     # if available use local copy in NCBI Taxonomy dump
#     # otherwise try to fetch it from the NCBI FTP server
#     try   { $content = file($self->tax_dir, 'gc.prt')->slurp }
#     catch { $content = get('ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt') };
#
#     croak "Error: cannot read 'gc.prt' file; aborting!"
#         unless $content;
#
#     return $content;
# }

# new version based on templating
sub _get_gcprt_content {
    # The gc.prt text is embedded verbatim in a non-interpolating heredoc.
    # NOTE(review): the heredoc body looks like a placeholder meant to be
    # substituted by the templating step that generates this module --
    # confirm against the generator.
    my $gcprt_text = <<'EOT';
[% gcprt %]
EOT

    return $gcprt_text;
}

# finalize the class: ask its Moose metaclass to make it immutable
__PACKAGE__->meta->make_immutable;
# a Perl module file must end with a true value
1;

__END__

=head1 SYNOPSIS

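    use Bio::MUST::Core::GeneticCode::Factory;

    my $factory = Bio::MUST::Core::GeneticCode::Factory->new;

    # list all keys (NCBI ids and names) under which codes are stored
    my @keys = $factory->list_codes;

    # fetch a genetic code either by NCBI id or by name
    my $code = $factory->code_for(4);
    my $sgc3 = $factory->code_for('SGC3');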

=head1 DESCRIPTION

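This class parses the content of the NCBI F<gc.prt> file, which is embedded
in the module itself, and builds one L<Bio::MUST::Core::GeneticCode> object
per genetic code. Each code is stored under its numeric NCBI id and under
each of its names, and can be retrieved with C<code_for>. The available keys
are returned by C<list_codes>. The table is built lazily on first access.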