package Bio::MUST::Core::GeneticCode::Factory;
# ABSTRACT: Genetic code factory based on NCBI gc.prt file
use Moose;
use namespace::autoclean;
# AUTOGENERATED CODE! DO NOT MODIFY THIS FILE!
use autodie;
use feature qw(say);
use Carp;
use File::Spec;
use LWP::Simple qw(get);
use Path::Class qw(file);
use Try::Tiny;
use Bio::MUST::Core::Types;
use aliased 'Bio::MUST::Core::GeneticCode';
# Public, read-only path to the NCBI Taxonomy dump directory.
# The value is coerced to the project's Dir type on assignment. Within this
# file it is only referenced by the commented-out legacy gc.prt reader.
has tax_dir => (
    is     => 'ro',
    isa    => 'Bio::MUST::Core::Types::Dir',
    coerce => 1,
);
# Private hash hosting the NCBI genetic codes.
# It cannot be supplied to the constructor (init_arg is undef) and is
# populated on first access by _build_code_for. Two delegated methods are
# exposed: code_for (hash 'get') and list_codes (hash 'keys').
has _code_for => (
    is       => 'ro',
    isa      => 'HashRef[Bio::MUST::Core::GeneticCode]',
    traits   => [ qw(Hash) ],

    # construction policy
    init_arg => undef,
    lazy     => 1,
    builder  => '_build_code_for',

    # public delegations
    handles  => {
        list_codes => 'keys',
        code_for   => 'get',
    },
);
## no critic (ProhibitUnusedPrivateSubroutines)
# Builder for the private '_code_for' attribute.
#
# Parses the gc.prt content returned by _get_gcprt_content and builds one
# codon => amino acid translation table per genetic code block.
#
# Returns: a hash reference in which each GeneticCode object is stored under
#          the NCBI id of its code and under each of its (demultiplexed)
#          names. All objects created for one code share the same
#          translation table hash reference.
# Croaks:  if no code block is found at all, or if a block lacks its id, its
#          'ncbieaa' line or one of its 'Base1..3' lines, or if these four
#          lines do not all have the same (non-zero) length.
sub _build_code_for {
    my $self = shift;

    # split file content into code blocks
    # (braces are excluded from the capture, so only innermost pairs match)
    my @codes = $self->_get_gcprt_content =~ m/ \{ ( [^{}]+ ) \} /xmsgc;
    croak "Error: cannot parse 'gc.prt' file; aborting!" unless @codes;

    # Genetic-code-table ::= {
    # ...
    # {
    # name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate
    # Mitochondrial; Mycoplasma; Spiroplasma" ,
    # name "SGC3" ,
    # id 4 ,
    # ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
    # sncbieaa "--MM---------------M------------MMMM---------------M------------"
    # -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
    # -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
    # -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
    # },
    # ...
    # }

    my %code_for;

    for my $code (@codes) {

        # get id for current code
        # (\b keeps the keyword from matching at the end of a longer word)
        my ($id) = $code =~ m/ \b id \s* (\d+) /xms;
        croak "Error: cannot parse genetic code id in 'gc.prt' file; aborting!"
            unless defined $id;

        # get all names for current code
        my @names = $code =~ m/ \b name \s* \"(.*?)\" /xmsg;
        @names = map { s{\n}{}xmsgr     } @names;   # remove newline chars
        @names = map { split m{;\s*}xms } @names;   # demultiplex names

        # retrieve the amino acid line
        # (\b ensures the 'sncbieaa' line can never be picked up instead)
        my ($aa_line) = $code =~ m/ \b ncbieaa \s* \"(.*?)\" /xms;

        # retrieve the three codon lines
        my ($b1_line) = $code =~ m/ Base1 \s* ([TACG]+) /xms;
        my ($b2_line) = $code =~ m/ Base2 \s* ([TACG]+) /xms;
        my ($b3_line) = $code =~ m/ Base3 \s* ([TACG]+) /xms;

        # refuse incomplete blocks rather than silently building a broken
        # (or empty) translation table out of undefined values
        my @lines = ($aa_line, $b1_line, $b2_line, $b3_line);
        croak "Error: cannot parse genetic code $id in 'gc.prt' file; aborting!"
            if grep { !defined $_ } @lines;

        # the four lines are read column-wise: they must be equally long
        my $codon_n = length $aa_line;
        croak "Error: cannot parse genetic code $id in 'gc.prt' file; aborting!"
            if $codon_n == 0 || grep { length($_) != $codon_n } @lines;

        $aa_line =~ s{\*}{x}xmsg;           # make STOPs MUST-compliant

        # split lines into aas and bases
        my @aas    = split //, $aa_line;
        my @bases1 = split //, $b1_line;
        my @bases2 = split //, $b2_line;
        my @bases3 = split //, $b3_line;

        # build translation table for current code
        my %aa_for = map {
            join( '', $bases1[$_], $bases2[$_], $bases3[$_] ) => $aas[$_]
        } 0..$#aas;

        # add gap 'codons' to code
        $aa_for{'***'} = q{*};
        $aa_for{'---'} = q{*};
        $aa_for{'   '} = q{ };      # blank triplet, like the other codons
        # NOTE(review): the single-space key below is what this file used to
        # define; it is kept for backward compatibility. Confirm whether any
        # caller really looks it up, as codons are otherwise triplets.
        $aa_for{' '}   = q{ };

        # store translation table under its various id and names
        $code_for{$_} = GeneticCode->new(
            ncbi_id => $id,
              _code => \%aa_for
        ) for ($id, @names);
    }

    return \%code_for;
}
## use critic
# old version using a local or remote copy of NCBI gc.prt file
# sub _get_gcprt_content {
# my $self = shift;
#
# my $content;
#
# # if available use local copy in NCBI Taxonomy dump
# # otherwise try to fetch it from the NCBI FTP server
# try { $content = file($self->tax_dir, 'gc.prt')->slurp }
# catch { $content = get('ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt') };
#
# croak "Error: cannot read 'gc.prt' file; aborting!"
# unless $content;
#
# return $content;
# }
# new version based on templating
# Returns the text held in the non-interpolating heredoc below as a single
# string. NOTE(review): '[% gcprt %]' looks like a template placeholder that
# is presumably replaced by the NCBI gc.prt content when this file is
# generated -- confirm against the generating script.
sub _get_gcprt_content {
    my $gcprt_text = <<'EOT';
[% gcprt %]
EOT

    return $gcprt_text;
}
# finalize the Moose class once all attributes and methods are declared
__PACKAGE__->meta->make_immutable;
# a Perl module file must end with a true value
1;
__END__
=head1 SYNOPSIS
# TODO
=head1 DESCRIPTION
# TODO