$Bio::MUST::Core::Taxonomy::VERSION
=
'0.250380'
;
with
Storage(
'io'
=>
'StorableFile'
);
qw(first firstidx uniq each_array mesh count_by max_by)
;
has
'tax_dir'
=> (
traits
=> [
'DoNotSerialize'
],
is
=>
'ro'
,
isa
=>
'Bio::MUST::Core::Types::Dir'
,
required
=> 1,
coerce
=> 1,
);
has
'_ncbi_tax'
=> (
is
=>
'ro'
,
isa
=>
'Bio::MUST::Core::Taxonomy::MooseNCBI'
,
lazy
=> 1,
builder
=>
'_build_ncbi_tax'
,
handles
=>
qr{get_\w+}
xms,
);
has
'_gi_mapper'
=> (
traits
=> [
'DoNotSerialize'
],
is
=>
'ro'
,
isa
=>
'Bio::LITE::Taxonomy::NCBI::Gi2taxid'
,
lazy
=> 1,
builder
=>
'_build_gi_mapper'
,
handles
=> {
get_taxid_from_gi
=>
'get_taxid'
,
},
);
has
'_is_deleted'
=> (
traits
=> [
'Hash'
],
is
=>
'ro'
,
isa
=>
'HashRef[Bool]'
,
lazy
=> 1,
builder
=>
'_build_is_deleted'
,
handles
=> {
'is_deleted'
=>
'defined'
,
},
);
has
'_'
.
$_
.
'_for'
=> (
traits
=> [
'Hash'
],
is
=>
'ro'
,
isa
=>
'HashRef[Str]'
,
lazy
=> 1,
builder
=>
'_build_'
.
$_
.
'_for'
,
handles
=> {
'is_'
.
$_
=>
'defined'
,
$_
.
'_for'
=>
'get'
,
},
)
for
qw(merged misleading)
;
has
'_dupes_for'
=> (
traits
=> [
'Hash'
],
is
=>
'ro'
,
isa
=>
'HashRef[HashRef[Str]]'
,
lazy
=> 1,
builder
=>
'_build_dupes_for'
,
handles
=> {
'is_dupe'
=>
'defined'
,
'dupes_for'
=>
'get'
,
},
);
has
'_rank_for'
=> (
traits
=> [
'Hash'
],
is
=>
'ro'
,
isa
=>
'HashRef[Str]'
,
lazy
=> 1,
builder
=>
'_build_rank_for'
,
handles
=> {
'rank_for'
=>
'get'
,
},
);
has
'_strain_taxid_for'
=> (
traits
=> [
'Hash'
],
is
=>
'ro'
,
isa
=>
'HashRef[HashRef[Str]]'
,
lazy
=> 1,
builder
=>
'_build_strain_taxid_for'
,
handles
=> {
'get_strain_taxid_for'
=>
'get'
,
},
);
has
'_matcher'
=> (
traits
=> [
'DoNotSerialize'
],
is
=>
'ro'
,
isa
=>
'Algorithm::NeedlemanWunsch'
,
init_arg
=>
undef
,
lazy
=> 1,
builder
=>
'_build_matcher'
,
handles
=> {
nw_score
=>
'align'
,
},
);
sub
_build_ncbi_tax {
my
$self
=
shift
;
local
$_
=
q{}
;
my
$names_txid
= file(
$self
->tax_dir,
'names*.dmp'
);
my
$names_gca
= file(
$self
->tax_dir,
'gca*names.dmp'
);
my
$nodes_txid
= file(
$self
->tax_dir,
'nodes*.dmp'
);
my
$nodes_gca
= file(
$self
->tax_dir,
'gca*nodes.dmp'
);
my
@names_files
= (
$names_gca
,
$names_txid
);
my
@nodes_files
= (
$nodes_txid
,
$nodes_gca
);
my
$ol
=
q{'if (index($_, "scientific name") == -1) { print }
else
{
push
@sns
,
$_
} END{
print
for
@sns
}'};
open
my
$names_fh
,
'-|'
,
qq{perl -nle $ol @names_files 2> /dev/null}
;
open
my
$nodes_fh
,
'-|'
,
qq{cat @nodes_files 2> /dev/null}
;
return
MooseNCBI->new(
names
=>
$names_fh
,
nodes
=>
$nodes_fh
,
synonyms
=> [
'synonym'
,
'genbank synonym'
,
'acronym'
,
'genbank acronym'
,
'anamorph'
,
'genbank anamorph'
,
'teleomorph'
,
'blast name'
,
'common name'
,
'genbank common name'
,
'equivalent name'
,
'includes'
,
]
);
}
sub
_build_gi_mapper {
my
$self
=
shift
;
local
$_
=
q{}
;
return
Bio::LITE::Taxonomy::NCBI::Gi2taxid->new(
dict
=> file(
$self
->tax_dir,
'gi_taxid_nucl_prot.bin'
),
save_mem
=> 1,
);
}
sub
_build_is_deleted {
my
$self
=
shift
;
my
%is_deleted
;
my
$infile
= file(
$self
->tax_dir,
'delnodes.dmp'
);
open
my
$in
,
'<'
,
$infile
;
while
(
my
$line
= <
$in
>) {
my
(
$taxon_id
) =
$line
=~ m/^(\d+)/xms;
$is_deleted
{
$taxon_id
} = 1;
}
return
\
%is_deleted
;
}
sub
_build_merged_for {
my
$self
=
shift
;
my
%merged_for
;
my
$infile
= file(
$self
->tax_dir,
'merged.dmp'
);
open
my
$in
,
'<'
,
$infile
;
while
(
my
$line
= <
$in
>) {
chomp
$line
;
my
(
$old_taxid
,
$new_taxid
) =
split
/\s*\|\s*/xms,
$line
;
$merged_for
{
$old_taxid
} =
$new_taxid
;
}
return
\
%merged_for
;
}
sub
_build_misleading_for {
my
$self
=
shift
;
my
%misleading_for
;
my
$misleadings
= file(
$self
->tax_dir,
'misleading*.dmp'
);
open
my
$in
,
'-|'
,
"cat $misleadings 2> /dev/null"
;
while
(
my
$line
= <
$in
>) {
chomp
$line
;
my
(
$taxid
,
$full_org
) =
split
/\s*\|\s*/xms,
$line
;
$misleading_for
{
$full_org
} =
$taxid
;
}
return
\
%misleading_for
;
}
sub
_build_dupes_for {
my
$self
=
shift
;
my
%taxids_for
;
my
$infile
= file(
$self
->tax_dir,
'names.dmp'
);
open
my
$in
,
'<'
,
$infile
;
LINE:
while
(
my
$line
= <
$in
>) {
next
LINE
unless
$line
=~ m/scientific \s name/xms;
chomp
$line
;
my
(
$taxon_id
,
$taxon
) = (
split
/\s*\|\s*/xms,
$line
)[0..1];
push
@{
$taxids_for
{
$taxon
} },
$taxon_id
;
}
my
%dupes_for
;
my
$tax
=
$self
->new(
tax_dir
=>
$self
->tax_dir );
TAXON:
while
(
my
(
$taxon
,
$taxids
) =
each
%taxids_for
) {
next
TAXON
if
@{
$taxids
} < 2;
no
warnings
'uninitialized'
;
my
%taxid_for
=
map
{
(
join
q{; }
, (
$tax
->get_taxonomy(
$_
) )[-5..-1] ) =>
$_
} @{
$taxids
};
carp
"[BMC] Note: $taxon cannot be disambiguated."
.
' This should not be an issue.'
if
keys
%taxid_for
< @{
$taxids
};
$dupes_for
{
$taxon
} = \
%taxid_for
;
}
return
\
%dupes_for
;
}
const
my
@LEVELS
=> (
'skip'
,
'top'
,
'superkingdom'
,
'kingdom'
,
'subkingdom'
,
'superphylum'
,
'phylum'
,
'subphylum'
,
'superclass'
,
'class'
,
'subclass'
,
'infraclass'
,
'cohort'
,
'subcohort'
,
'superorder'
,
'order'
,
'suborder'
,
'infraorder'
,
'parvorder'
,
'superfamily'
,
'family'
,
'subfamily'
,
'tribe'
,
'subtribe'
,
'genus'
,
'subgenus'
,
'section'
,
'subsection'
,
'series'
,
'species group'
,
'species subgroup'
,
'species'
,
'subspecies'
,
'subvariety'
,
'varietas'
,
'morph'
,
'forma'
,
'forma specialis'
,
'isolate'
,
'strain'
,
'pathogroup'
,
'serogroup'
,
'serotype'
,
'biotype'
,
'genotype'
,
'clade'
,
'no rank'
,
);
sub
_build_rank_for {
my
$i
= 1;
return
{
map
{
$_
=>
$i
-- }
@LEVELS
};
}
sub
_build_strain_taxid_for {
my
$self
=
shift
;
my
$names_txid
= file(
$self
->tax_dir,
'names*.dmp'
);
my
$names_gca
= file(
$self
->tax_dir,
'gca*names.dmp'
);
my
@names_files
= (
$names_gca
,
$names_txid
);
open
my
$names_fh
,
'-|'
,
"cat @names_files"
;
my
%strain_taxid_for
;
LINE:
while
(
my
$line
= <
$names_fh
>) {
chomp
$line
;
my
(
$taxid
,
$org
) =
split
/\s*\|\s*/xms,
$line
;
my
(
$genus
,
$species
,
$strain
) = SeqId->parse_ncbi_name(
$org
);
next
LINE
unless
$strain
;
my
$org_key
= _make_hashkey(
$genus
,
$species
);
my
$strain_key
= _clean_strain(
$strain
);
$strain_taxid_for
{
$org_key
}{
$strain_key
} =
$taxid
if
defined
$strain_key
and
length
$strain_key
;
}
return
\
%strain_taxid_for
;
}
sub
_clean_strain {
my
$strain
=
shift
;
$strain
=~ s{\b
substr
\b}{}xmsgi;
$strain
=~ s{\b strain \b}{}xmsgi;
$strain
=~ s{\b subsp \b}{}xmsgi;
$strain
=~ s{\b str \b}{}xmsgi;
$strain
=~
tr
/A-Za-z0-9//cd;
return
lc
$strain
;
}
sub
_make_hashkey {
my
$genus
=
shift
;
my
$species
=
shift
;
$species
=~
tr
/-//d;
my
$org
=
lc
(
$genus
.
q{ }
.
$species
);
return
$org
;
}
const
my
$MATCHYOU
=> 3;
const
my
$MISMATCH
=> -1;
const
my
$GAP_OPEN
=> -2;
const
my
$GAP_EXTEND
=> 0;
sub
_build_matcher {
my
$self
=
shift
;
local
$_
=
q{}
;
my
$matcher
= Algorithm::NeedlemanWunsch->new( \
&_matcher_matrix
);
$matcher
->gap_open_penalty(
$GAP_OPEN
);
$matcher
->gap_extend_penalty(
$GAP_EXTEND
);
return
$matcher
;
}
const
my
$GAP_CODE
=>
'*'
;
const
my
$MATCH_CODE
=>
'.'
;
const
my
$MISMATCH_CODE
=>
':'
;
const
my
%SCORE_FOR
=> (
$GAP_CODE
=>
$GAP_OPEN
,
$MATCH_CODE
=>
$MATCHYOU
,
$MISMATCH_CODE
=>
$MISMATCH
,
);
sub
_matcher_matrix {
return
$SCORE_FOR
{ _match_code(
@_
) };
}
sub
_match_code {
return
$GAP_CODE
unless
@_
;
if
(
$_
[0] =~ m/ [^A-Za-z0-9] /xms &&
$_
[1] =~ m/ [A-Za-z0-9] /xms
or
$_
[0] =~ m/ [A-Za-z0-9] /xms &&
$_
[1] =~ m/ [^A-Za-z0-9] /xms ) {
return
$GAP_CODE
;
}
if
(
$_
[0] =~ m/ [A-Za-z0-9] /xms ||
$_
[1] =~ m/ [A-Za-z0-9] /xms ) {
return
(
lc
$_
[0] eq
lc
$_
[1] ) ?
$MATCH_CODE
:
$MISMATCH_CODE
;
}
return
$MATCH_CODE
;
}
around
qr{ _from_seq_id \z | _from_legacy_seq_id \z }
xms
=>
sub
{
my
$method
=
shift
;
my
$self
=
shift
;
my
$seq_id
=
shift
;
match_on_type
$seq_id
=> (
'Bio::MUST::Core::Types::Lineage'
=>
sub
{ },
'Str'
=>
sub
{
$seq_id
= SeqId->new(
full_id
=>
$seq_id
)
},
=>
sub
{ },
);
return
$self
->
$method
(
$seq_id
,
@_
);
};
around
qr{ _from_name \z }
xms
=>
sub
{
my
$method
=
shift
;
my
$self
=
shift
;
my
$name
=
shift
;
return
undef
unless
$name
;
if
(
$self
->is_dupe(
$name
) ) {
carp
"[BMC] Warning: $name is taxonomically ambiguous;"
.
' returning undef!'
;
return
undef
;
}
return
$self
->
$method
(
$name
);
};
my
%was_noted
;
around
qw( get_taxonomy get_taxonomy_with_levels get_term_at_level )
=>
sub
{
my
$method
=
shift
;
my
$self
=
shift
;
my
$taxon_id
=
shift
;
if
(
defined
$taxon_id
&&
$self
->is_merged(
$taxon_id
) ) {
my
$noted
=
$was_noted
{
$taxon_id
};
$was_noted
{
$taxon_id
} //= 1;
my
$msg
=
"[BMC] Note: merged taxid for $taxon_id;"
;
$taxon_id
=
$self
->merged_for(
$taxon_id
);
carp
"$msg using $taxon_id instead!"
unless
$noted
;
}
$taxon_id
=~
tr
/F/A/
if
defined
$taxon_id
&&
$taxon_id
=~
$GCAONLY
;
return
$self
->
$method
(
$taxon_id
,
@_
);
};
sub
get_taxid_from_seq_id {
my
$self
=
shift
;
my
$seq_id
=
shift
;
return
undef
if
$seq_id
->is_doubtful;
unless
(
$seq_id
->is_foreign ) {
my
$taxon_id
=
$self
->misleading_for(
$seq_id
->full_org );
return
$taxon_id
if
$taxon_id
;
}
return
$seq_id
->taxon_id
if
$seq_id
->taxon_id;
if
(
$seq_id
->is_foreign ) {
return
$self
->get_taxid_from_gi(
$seq_id
->gi )
if
$seq_id
->gi;
return
$self
->get_taxid_from_name(
$seq_id
->full_id )
}
if
(
$seq_id
->strain ) {
my
$taxon_id
=
$self
->get_taxid_from_legacy_seq_id(
$seq_id
);
return
$taxon_id
if
$taxon_id
;
}
unless
(
$seq_id
->is_genus_only ) {
my
$taxon_id
=
$self
->get_taxid_from_name(
$seq_id
->org );
return
$taxon_id
if
$taxon_id
;
}
return
$self
->get_taxid_from_name(
$seq_id
->genus );
}
sub
get_taxid_from_legacy_seq_id {
my
$self
=
shift
;
my
$seq_id
=
shift
;
my
$org_key
= _make_hashkey(
$seq_id
->genus,
$seq_id
->species );
my
$query_strain
=
$seq_id
->strain;
return
undef
unless
$query_strain
;
$query_strain
=
lc
SeqId->clean_strain(
$query_strain
);
my
$taxid_for
=
$self
->_strain_taxid_for->{
$org_key
};
return
undef
unless
$taxid_for
;
my
$taxid
=
$taxid_for
->{
$query_strain
};
return
$taxid
if
$taxid
;
my
@sbjct_strains
=
sort
{
$a
cmp
$b
}
keys
%{
$taxid_for
};
my
@split_query_strain
=
split
//xms,
$query_strain
;
my
@split_sbjct_strains
=
map
{ [
split
//xms ] }
@sbjct_strains
;
my
@scores
=
map
{
$self
->nw_score( \
@split_query_strain
,
$_
)
}
@split_sbjct_strains
;
my
$max_score
= List::AllUtils::max
@scores
;
my
$max_index
= firstidx {
$_
==
$max_score
}
@scores
;
my
$max_strain
=
$sbjct_strains
[
$max_index
];
my
$threshold
= List::AllUtils::min(
length
$query_strain
,
length
$max_strain
) *
$MATCHYOU
+
$GAP_OPEN
;
return
undef
if
$max_score
<
$threshold
;
return
$taxid_for
->{
$max_strain
};
}
sub
get_taxonomy_from_seq_id {
my
$self
=
shift
;
my
$seq_id
=
shift
;
return
match_on_type
$seq_id
=> (
'Bio::MUST::Core::SeqId'
=>
sub
{
$self
->_taxonomy_from_seq_id_(0,
$seq_id
);
},
'Bio::MUST::Core::Types::Lineage'
=>
sub
{
my
@taxonomy
=
split
qr{;\ *}
xms,
$seq_id
;
wantarray
?
@taxonomy
: \
@taxonomy
;
},
ArrayRef
=>
sub
{
wantarray
? @{
$seq_id
} :
$seq_id
},
);
}
sub
fetch_lineage {
return
shift
->get_taxonomy_from_seq_id(
@_
);
}
sub
get_taxid_from_taxonomy {
my
$self
=
shift
;
my
@taxonomy
=
$self
->get_taxonomy_from_seq_id(
@_
);
while
(
@taxonomy
) {
my
$taxon
=
$taxonomy
[-1];
my
$taxon_id
=
$self
->get_taxid_from_name(
$taxon
);
return
$taxon_id
if
$taxon_id
;
my
$dupes_for
=
$self
->dupes_for(
$taxon
);
no
warnings
'uninitialized'
;
$taxon_id
=
$dupes_for
->{
join
q{; }
,
@taxonomy
[-5..-1] };
if
(
$taxon_id
) {
carp
"[BMC] Note: managed to disambiguate $taxon based on lineage!"
;
return
$taxon_id
;
}
pop
@taxonomy
;
carp
"[BMC] Note: trying to identify $taxon by following lineage..."
if
@taxonomy
;
}
return
undef
;
}
sub
get_taxonomy_with_levels_from_seq_id {
return
shift
->_taxonomy_from_seq_id_(1,
@_
);
}
sub
_taxonomy_from_seq_id_ {
my
$self
=
shift
;
my
$levels
=
shift
;
my
$seq_id
=
shift
;
my
$taxon_id
=
$self
->get_taxid_from_seq_id(
$seq_id
);
my
@taxonomy
=
$levels
?
$self
->get_taxonomy_with_levels(
$taxon_id
)
:
$self
->get_taxonomy(
$taxon_id
);
@taxonomy
= ()
unless
$taxonomy
[0];
carp
'[BMC] Warning: cannot fetch tax for '
. (
$seq_id
->full_id ||
q{''}
)
.
'; ignoring it!'
unless
@taxonomy
;
return
wantarray
?
@taxonomy
: \
@taxonomy
;
}
sub
get_taxa_from_taxid {
my
$self
=
shift
;
my
$taxon_id
=
shift
;
my
@taxa
=
map
{
$self
->get_term_at_level(
$taxon_id
,
$_
) }
@_
;
return
wantarray
?
@taxa
: \
@taxa
;
}
sub
get_nexus_label_from_seq_id {
my
$self
=
shift
;
my
$seq_id
=
shift
;
my
$args
=
shift
// {};
my
@lineage
=
$self
->get_taxonomy_from_seq_id(
$seq_id
);
my
$org
=
pop
@lineage
//
$seq_id
->full_org(
q{ }
);
my
$acc
=
$args
->{append_acc} ?
$seq_id
->accession //
$seq_id
->gi :
undef
;
my
$label
=
defined
$org
&&
defined
$acc
?
"$org [$acc]"
:
defined
$org
?
$org
:
$seq_id
->full_id
;
return
SeqId->new(
full_id
=>
$label
)->nexus_id;
}
sub
get_common_taxonomy_from_seq_ids {
my
$self
=
shift
;
my
@seq_ids
=
@_
;
my
$threshold
= looks_like_number
$seq_ids
[0] ?
shift
@seq_ids
: 1.0;
my
@lineages
=
map
{
scalar
$self
->get_taxonomy_from_seq_id(
$_
)
}
@seq_ids
;
return
$self
->_common_taxonomy(
$threshold
,
@lineages
);
}
sub
compute_lca {
return
shift
->get_common_taxonomy_from_seq_ids(
@_
);
}
sub
_common_taxonomy {
my
$self
=
shift
;
my
$threshold
=
shift
;
my
@lineages
=
@_
;
@lineages
=
grep
{ @{
$_
} ?
$_
: () }
@lineages
;
my
@common_lineage
;
my
$n
=
@lineages
;
TAXON:
for
(
my
$i
= 0;
$n
;
$i
++) {
my
%count_for
= count_by {
$_
->[
$i
] //
q{}
}
@lineages
;
my
$taxon
= max_by {
$count_for
{
$_
} }
keys
%count_for
;
last
TAXON
unless
$taxon
;
my
$taxon_n
=
$count_for
{
$taxon
};
last
TAXON
if
$taxon_n
/
$n
<
$threshold
;
push
@common_lineage
,
$taxon
;
@lineages
=
grep
{ (
$_
->[
$i
] //
q{}
) eq
$taxon
}
@lineages
;
}
return
wantarray
?
@common_lineage
: \
@common_lineage
;
}
sub
attach_taxonomies_to_terminals {
my
$self
=
shift
;
my
$tree
=
shift
;
$tree
=
$tree
->tree
if
$tree
->isa(
'Bio::MUST::Core::Tree'
);
for
my
$tip
( @{
$tree
->get_terminals } ) {
my
@tax
=
$self
->get_taxonomy_with_levels_from_seq_id(
$tip
->get_name);
$tip
->set_generic(
'taxonomy'
=> [
map
{
$_
->[0] }
@tax
] );
$tip
->set_generic(
'levels'
=> [
map
{
$_
->[1] }
@tax
] );
}
return
;
}
sub
attach_taxonomies_to_internals {
my
$self
=
shift
;
my
$tree
=
shift
;
$tree
->tree->visit_depth_first(
-post
=>
sub
{
my
$node
=
shift
;
return
if
$node
->is_terminal;
my
@lineages
;
my
@max_levels
;
for
(
my
$i
= 0;
my
$child
=
$node
->get_child(
$i
);
$i
++) {
push
@lineages
,
$child
->get_generic(
'taxonomy'
);
my
@levels
= @{
$child
->get_generic(
'levels'
) };
@max_levels
=
@levels
if
@levels
>
@max_levels
;
}
my
@common_lineage
=
$self
->_common_taxonomy(1.0,
@lineages
);
$node
->set_generic(
'taxonomy'
=> \
@common_lineage
);
$node
->set_generic(
'levels'
=> [
@max_levels
[0..
$#common_lineage
] ]
);
return
;
},
);
return
;
}
sub
attach_taxa_to_entities {
my
$self
=
shift
;
my
$tree
=
shift
;
my
$args
=
shift
// {};
my
%arg_levels
= (
name
=>
'no rank'
,
collapse
=>
'skip'
, %{
$args
} );
my
%target_for
=
map
{
$_
=>
$self
->rank_for(
$arg_levels
{
$_
} )
}
keys
%arg_levels
;
if
(
$target_for
{collapse} <
$target_for
{name}) {
carp
'[BMC] Warning: collapsing level must include naming level;'
.
' upgrading!'
;
$target_for
{collapse} =
$target_for
{name};
}
NODE:
for
my
$node
( @{
$tree
->tree->get_entities } ) {
my
$is_named
;
$node
->set_generic(
'taxon'
=>
undef
);
$node
->set_generic(
'taxon_collapse'
=>
undef
);
my
@lineage
= @{
$node
->get_generic(
'taxonomy'
) };
my
@levels
= @{
$node
->get_generic(
'levels'
) };
while
(
my
$taxon
=
pop
@lineage
) {
my
$rank
=
$self
->rank_for(
pop
@levels
);
if
(!
$is_named
&&
$rank
>=
$target_for
{name}) {
$node
->set_generic(
'taxon'
=>
$taxon
);
$is_named
= 1;
}
if
(
$rank
==
$target_for
{collapse}) {
$node
->set_generic(
'taxon_collapse'
=>
$taxon
);
next
NODE;
}
}
}
return
;
}
sub
gi_mapper {
my
$self
=
shift
;
my
$listable
=
shift
;
return
IdMapper->new(
long_ids
=> [
$self
->_taxids_from_gis(
$listable
) ],
abbr_ids
=> [
map
{
$_
->full_id }
$listable
->all_seq_ids ],
);
}
const
my
$BATCH_SIZE
=> 252;
const
my
$GOLDEN_RATIO
=> 1.618;
const
my
$MAX_ATTEMPT
=> 3;
sub
_taxids_from_gis {
my
$self
=
shift
;
my
$listable
=
shift
;
my
@seq_ids
=
$listable
->all_seq_ids;
my
@full_ids
=
map
{
$_
->full_id }
@seq_ids
;
my
@gis
=
map
{
$_
->gi }
@seq_ids
;
my
@uniq_gis
= uniq
@gis
;
my
%taxid_for
;
try
{
%taxid_for
=
map
{
$_
=>
$self
->get_taxid_from_gi(
$_
) }
@uniq_gis
;
}
catch
{
};
my
@miss_gis
=
grep
{ not
$taxid_for
{
$_
} }
@uniq_gis
;
if
(
@miss_gis
) {
}
my
%miss_taxid_for
;
my
$batch_size
=
$BATCH_SIZE
;
my
$attempt
= 1;
ESUMMARY:
while
(
@miss_gis
) {
my
$total
=
@miss_gis
;
$batch_size
= List::AllUtils::min(
$batch_size
,
$total
);
BATCH:
for
(
my
$i
= 0;
$i
<
$total
;
$i
+=
$batch_size
) {
my
$end
= List::AllUtils::min(
$i
+
$batch_size
,
$#miss_gis
);
my
$report
= get(
.
'db=protein&id='
.
join
','
,
@miss_gis
[
$i
..
$end
]
);
last
BATCH
unless
$report
;
my
$ob
= XML::Bare->new(
text
=>
$report
);
my
$root
=
$ob
->parse();
for
my
$docsum
( @{
$root
->{eSummaryResult}{DocSum} } ) {
my
@fields
= @{
$docsum
->{Item} };
my
$gi
= first {
$_
->{Name}->{value} eq
'Gi'
}
@fields
;
my
$taxid
= first {
$_
->{Name}->{value} eq
'TaxId'
}
@fields
;
$miss_taxid_for
{
$gi
->{value} } =
$taxid
->{value};
}
}
%taxid_for
= (
%taxid_for
,
%miss_taxid_for
);
@miss_gis
=
grep
{ not
$taxid_for
{
$_
} }
@uniq_gis
;
if
(
@miss_gis
) {
if
(++
$attempt
>
$MAX_ATTEMPT
) {
$batch_size
= floor(
$batch_size
/
$GOLDEN_RATIO
);
$attempt
= 1;
}
unless
(
$batch_size
) {
carp
'[BMC] Warning: no answer from NCBI E-utilities;'
.
' dropping GIs!'
;
last
ESUMMARY;
}
}
}
my
@taxids_gis
;
my
$ea
= each_array(
@full_ids
,
@gis
);
while
(
my
(
$full_id
,
$gi
) =
$ea
->() ) {
push
@taxids_gis
,
$taxid_for
{
$gi
} .
'|'
.
$full_id
;
}
return
@taxids_gis
;
}
sub
tab_mapper {
my
$self
=
shift
;
my
$infile
=
shift
;
my
$args
=
shift
// {};
my
$col
=
$args
->{column} // 1;
my
$sep
=
$args
->{separator} //
qr{\t}
xms;
my
$idm
=
$args
->{gi2taxid};
open
my
$in
,
'<'
,
$infile
;
tie
my
%family_for
,
'Tie::IxHash'
;
LINE:
while
(
my
$line
= <
$in
>) {
chomp
$line
;
next
LINE
if
$line
=~
$EMPTY_LINE
||
$line
=~
$COMMENT_LINE
;
my
@fields
=
split
$sep
,
$line
;
$family_for
{
$fields
[0] } =
$fields
[
$col
];
}
my
@abbr_ids
=
keys
%family_for
;
my
@must_ids
=
@abbr_ids
;
if
(
$idm
) {
my
$gi_mapper
= IdMapper->load(
$idm
);
my
@gis
=
map
{
$_
->gi }
$gi_mapper
->all_abbr_seq_ids;
my
@taxon_ids
=
map
{
$_
->taxon_id }
$gi_mapper
->all_long_seq_ids;
my
%taxid_for
= mesh
@gis
,
@taxon_ids
;
for
my
$id
(
@must_ids
) {
my
$taxon_id
=
$taxid_for
{ SeqId->new(
full_id
=>
$id
)->gi };
my
@taxonomy
=
$self
->get_taxonomy(
$taxon_id
);
my
$org
=
$taxonomy
[-1];
$id
= SeqId->new_with(
org
=>
$org
,
taxon_id
=>
$taxon_id
,
gi
=>
$id
,
)->full_id;
}
}
my
@long_ids
;
my
$ea
= each_array
@must_ids
,
@abbr_ids
;
while
(
my
(
$must_id
,
$abbr_id
) =
$ea
->() ) {
(
my
$family
=
$family_for
{
$abbr_id
} //
q{}
) =~
tr
/-
@_
/./;
$family
.=
'-'
if
$family
;
push
@long_ids
,
$family
.
$must_id
;
}
return
IdMapper->new(
long_ids
=> \
@long_ids
,
abbr_ids
=> \
@abbr_ids
,
);
}
sub
tax_mapper {
my
$self
=
shift
;
my
$listable
=
shift
;
return
IdMapper->new(
long_ids
=> [
map
{
$self
->get_nexus_label_from_seq_id(
$_
,
@_
)
}
$listable
->all_seq_ids ],
abbr_ids
=> [
map
{
$_
->full_id }
$listable
->all_seq_ids ],
);
}
sub
tax_filter {
my
$self
=
shift
;
my
$list
=
shift
;
return
Filter->new(
tax
=>
$self
,
_specs
=>
$list
);
}
sub
tax_criterion {
my
$self
=
shift
;
my
$args
=
shift
;
$args
->{tax_filter} =
$self
->tax_filter(
$args
->{tax_filter} );
return
Criterion->new(
$args
);
}
sub
tax_category {
my
$self
=
shift
;
my
$args
=
shift
;
$args
->{criteria} = [
map
{
$self
->tax_criterion(
$_
) } @{
$args
->{criteria} }
];
return
Category->new(
$args
);
}
sub
tax_classifier {
my
$self
=
shift
;
my
$args
=
shift
;
$args
->{categories} = [
map
{
$self
->tax_category(
$_
) } @{
$args
->{categories} }
];
return
Classifier->new(
$args
);
}
sub
tax_labeler_from_systematic_frame {
my
$self
=
shift
;
my
$infile
=
shift
;
my
@lines
= file(
$infile
)->slurp;
(
my
$newick_str
=
pop
@lines
) =~ s/(?: :-?(\d+) ){3}/:1/xmsg;
my
$tree
= parse(
-format
=>
'newick'
,
-string
=>
$newick_str
,
-keep_whitespace
=> 1,
)->first;
my
@labels
=
map
{
$_
->get_name } @{
$tree
->get_terminals };
return
$self
->tax_labeler_from_list( \
@labels
);
}
sub
tax_labeler_from_list {
my
$self
=
shift
;
my
$list
=
shift
;
return
Labeler->new(
tax
=>
$self
,
labels
=>
$list
);
}
sub
load_color_scheme {
my
$self
=
shift
;
my
$scheme
= ColorScheme->new(
tax
=>
$self
);
return
$scheme
->load(
@_
);
}
sub
eq_tax {
my
$self
=
shift
;
my
$got
=
shift
;
my
$expect
=
shift
;
my
$classifier
=
shift
;
my
$got_taxon
=
$classifier
->classify(
$got
,
@_
);
my
$exp_taxon
=
$classifier
->classify(
$expect
,
@_
);
return
(
$got_taxon
,
$exp_taxon
)
if
wantarray
;
return
undef
unless
$got_taxon
&&
$exp_taxon
;
return
$got_taxon
eq
$exp_taxon
;
}
const
my
$CACHEDB
=>
'cachedb.bin'
;
sub
new_from_cache {
my
$class
=
shift
;
my
%args
=
@_
;
my
$tax_dir
= dir(
glob
$args
{tax_dir} );
my
$cachefile
= file(
$tax_dir
,
$CACHEDB
);
my
$tax
=
$class
->load(
$cachefile
,
inject
=> {
tax_dir
=>
$tax_dir
} );
return
$tax
;
}
sub
update_cache {
my
$self
=
shift
;
my
$cachefile
= file(
$self
->tax_dir,
$CACHEDB
);
$self
->store(
$cachefile
);
return
1;
}
sub
setup_taxdir {
my
$class
=
shift
;
my
$tax_dir
=
shift
;
my
$args
=
shift
// {};
my
$source
=
$args
->{source};
$class
->_setup_ncbi_taxdir(
$tax_dir
,
$args
)
if
$source
eq
'ncbi'
;
$class
->_setup_gtdb_taxdir(
$tax_dir
)
if
$source
eq
'gtdb'
;
return
;
}
sub
_setup_ncbi_taxdir {
my
$class
=
shift
;
my
$tax_dir
=
shift
;
my
$args
=
shift
// {};
my
$gi_mapper
=
$args
->{gi_mapper} // 0;
$tax_dir
= dir(
glob
$tax_dir
);
$tax_dir
->mkpath();
my
@targets
= (
'taxdump.tar.gz'
,
$gi_mapper
?
qw(gi_taxid_nucl.dmp.gz gi_taxid_prot.dmp.gz)
: ()
);
my
@dmpfiles
;
for
my
$target
(
@targets
) {
my
$url
=
"$base/$target"
;
my
$zipfile
= file(
$tax_dir
,
$target
)->stringify;
my
$ret_code
= getstore(
$url
,
$zipfile
);
croak
"[BMC] Error: cannot download $url: error $ret_code; aborting!"
unless
$ret_code
== 200;
if
(
$target
=~ m/\A gi_taxid/xms) {
system
(
"gzip -d $zipfile"
);
push
@dmpfiles
, change_suffix(
$zipfile
,
q{}
);
}
else
{
system
(
"tar -xzf $zipfile -C $tax_dir"
);
file(
$zipfile
)->remove;
}
}
my
$gcanamefile
= file(
$tax_dir
,
'gca0-names.dmp'
);
my
$gcanodefile
= file(
$tax_dir
,
'gca0-nodes.dmp'
);
$gcanamefile
->remove
if
-e
$gcanamefile
;
$gcanodefile
->remove
if
-e
$gcanodefile
;
$class
->_make_gca_files(
$tax_dir
);
if
( -r
$gcanamefile
&& -r
$gcanodefile
) {
}
my
$cachefile
= file(
$tax_dir
,
$CACHEDB
);
$cachefile
->remove
if
-e
$cachefile
;
if
(
$gi_mapper
) {
my
$mrgfile
= file(
$tax_dir
,
'gi_taxid_nucl_prot.dmp'
)->stringify;
system
(
"sort -nm @dmpfiles > $mrgfile"
);
my
$binfile
= change_suffix(
$mrgfile
,
'.bin'
);
new_dict(
in
=>
$mrgfile
,
out
=>
$binfile
);
file(
$_
)->remove
for
@dmpfiles
,
$mrgfile
;
}
if
( -r file(
$tax_dir
,
'names.dmp'
) && -r file(
$tax_dir
,
'nodes.dmp'
) ) {
return
1;
}
return
0;
}
sub
_make_gca_files {
my
$class
=
shift
;
my
$tax_dir
=
shift
;
const
my
$FS
=>
qq{\t|\t}
;
const
my
$FS_REGEX
=>
qr{\t \| \t}
xms;
my
%rank_for
;
my
%parent_taxid_for
;
my
$nodes
= file(
$tax_dir
,
'nodes.dmp'
);
open
my
$in
,
'<'
,
$nodes
;
while
(
my
$line
= <
$in
>) {
chomp
$line
;
my
(
$taxon_id
,
$parent_taxon_id
,
$rank
) =
$line
=~ m/^ (\d+)
$FS_REGEX
(\d+)
$FS_REGEX
([^\t]+)
$FS_REGEX
/xmsg;
$rank_for
{
$taxon_id
} =
$rank
;
$parent_taxid_for
{
$taxon_id
} =
$parent_taxon_id
;
}
my
$tax
=
$class
->new(
tax_dir
=>
$tax_dir
);
my
@targets_acc
=
qw(
assembly_summary_genbank_historical.txt
assembly_summary_genbank.txt
assembly_summary_refseq_historical.txt
assembly_summary_refseq.txt
)
;
my
%name_for
;
my
%node_for
;
my
%fix_for
;
for
my
$target
(
@targets_acc
) {
my
$url
=
"$base_acc/$target"
;
my
$file
= file(
$tax_dir
,
$target
)->stringify;
my
$ret_code
= getstore(
$url
,
$file
);
croak
"[BMC] Error: cannot download $url: error $ret_code; aborting!"
unless
$ret_code
== 200;
open
my
$fh
,
'<'
,
$file
;
LINE:
while
(
my
$line
= <
$fh
>) {
chomp
$line
;
next
LINE
if
$line
=~
$EMPTY_LINE
||
$line
=~
$COMMENT_LINE
;
my
(
$accession
,
$taxon_id
,
$species_taxon_id
)
= (
split
/\t/xms,
$line
)[0,5,6];
$taxon_id
=
$tax
->merged_for(
$taxon_id
)
if
$tax
->is_merged(
$taxon_id
);
$species_taxon_id
=
$tax
->merged_for(
$species_taxon_id
)
if
$tax
->is_merged(
$species_taxon_id
);
next
LINE
if
$tax
->is_deleted(
$taxon_id
);
if
(
$accession
=~
$GCAONLY
) {
$accession
=~
tr
/F/A/;
next
LINE
if
defined
$name_for
{
$accession
};
}
my
@taxonomy
=
$tax
->get_taxonomy(
$taxon_id
);
my
$org
=
$taxonomy
[-1];
if
(
$species_taxon_id
==
$taxon_id
) {
$species_taxon_id
=
$parent_taxid_for
{
$taxon_id
};
$fix_for
{
$taxon_id
}{name_for} =
$org
;
$fix_for
{
$taxon_id
}{node_for} =
$species_taxon_id
;
}
$name_for
{
$accession
} =
$org
;
$node_for
{
$accession
} =
$species_taxon_id
;
}
close
$fh
;
}
my
$names_gca_file
= file(
$tax_dir
,
'gca0-names.dmp'
);
open
my
$names_out
,
'>'
,
$names_gca_file
;
for
my
$accession
(
keys
%name_for
) {
say
{
$names_out
}
join
$FS
,
$accession
,
$name_for
{
$accession
},
q{}
,
'scientific name'
;
}
for
my
$taxon_id
(
keys
%fix_for
) {
say
{
$names_out
}
join
$FS
,
$taxon_id
,
$fix_for
{
$taxon_id
}{name_for},
q{}
,
'scientific name'
;
$rank_for
{
$fix_for
{
$taxon_id
}{name_for}} =
$rank_for
{
$taxon_id
};
}
my
$nodes_gca_file
= file(
$tax_dir
,
'gca0-nodes.dmp'
);
open
my
$nodes_out
,
'>'
,
$nodes_gca_file
;
for
my
$accession
(
keys
%node_for
) {
say
{
$nodes_out
}
join
$FS
,
$accession
,
$node_for
{
$accession
},
$rank_for
{
$name_for
{
$accession
}} //
'no rank'
;
}
for
my
$taxon_id
(
keys
%fix_for
) {
say
{
$nodes_out
}
join
$FS
,
$taxon_id
,
$fix_for
{
$taxon_id
}{node_for},
$rank_for
{
$taxon_id
};
}
return
;
}
sub
_setup_gtdb_taxdir {
my
$class
=
shift
;
my
$tax_dir
=
shift
;
$tax_dir
= dir(
glob
$tax_dir
);
$tax_dir
->mkpath();
my
$listing
= get(
$base
)
or croak
"[BMC] Error: cannot download $base: error $!; aborting!"
;
croak
"[BMC] Error: cannot determine path to GTDB files; aborting!"
unless
$listing
;
my
@targets
= (
grep
{ m/metadata.*\.gz/xms }
$listing
=~ m/href=
"([^"
]+?)"/xmsg,
'FILE_DESCRIPTIONS'
,
'METHODS'
,
'RELEASE_NOTES'
,
'VERSION'
);
for
my
$target
(
@targets
) {
my
$url
=
"$base/$target"
;
my
$zipfile
= file(
$tax_dir
,
$target
)->stringify;
my
$ret_code
= getstore(
$url
,
$zipfile
);
croak
"[BMC] Error: cannot download $url: error $ret_code; aborting!"
unless
$ret_code
== 200;
if
(
$target
=~ m/metadata/xms) {
system
(
"gunzip $zipfile"
);
}
}
my
$new_arcfile
= file(
$tax_dir
,
'archaea_metadata.tsv'
);
my
$new_bacfile
= file(
$tax_dir
,
'bacteria_metadata.tsv'
);
$new_arcfile
->remove
if
-e
$new_arcfile
;
$new_bacfile
->remove
if
-e
$new_bacfile
;
my
(
$arcbase
) = fileparse(
$targets
[0],
qr{\.[^.]*}
xms);
my
(
$bacbase
) = fileparse(
$targets
[1],
qr{\.[^.]*}
xms);
my
$old_arcfile
= file(
$tax_dir
,
$arcbase
);
my
$old_bacfile
= file(
$tax_dir
,
$bacbase
);
$old_arcfile
->move_to(
$new_arcfile
);
$old_bacfile
->move_to(
$new_bacfile
);
if
( -r
$new_arcfile
&& -r
$new_bacfile
) {
}
else
{
return
0;
}
my
$prok_file
= file(
$tax_dir
,
'prok_metadata.tsv'
);
$prok_file
->remove
if
-e
$prok_file
;
system
(
"cat $new_arcfile $new_bacfile > $prok_file"
);
my
$table_for
= _read_gtdb_metadata(
$prok_file
);
my
%rank_for
=
map
{
$_
eq
'superkingdom'
?
'd__'
:
substr
(
$_
, 0, 1) .
'__'
=>
$_
}
qw (superkingdom
phylum class order family genus species);
my
%tree
= (
1
=> {
rank
=>
'no rank'
,
name
=>
'root'
,
uniq_name
=>
q{}
,
children
=> {
2
=> {
rank
=>
'no rank'
,
name
=>
'cellular organisms'
,
children
=> {},
},
},
}
);
for
my
$gca
(
keys
%{
$table_for
} ) {
my
$lineage
=
$table_for
->{
$gca
}{
'gtdb_taxonomy'
};
my
@taxonomy
=
split
q{;}
,
$lineage
;
my
$tree_ref
=
$tree
{1}{children}{2}{children};
my
%count_for
= count_by {
substr
(
$_
, 3,
length
()-1 ) }
@taxonomy
;
while
(
my
$gtdb_taxon
=
shift
@taxonomy
) {
my
(
$rank_code
,
$taxon
) = (
$gtdb_taxon
) =~ m/^([a-z]__)(.*)/xms;
my
$rank
=
$rank_for
{
$rank_code
};
my
$taxon_id
=
$rank_code
.
lc
(
$taxon
=~
tr
/A-Za-z0-9//cdr);
unless
(
$tree_ref
->{
$taxon_id
} ) {
$tree_ref
->{
$taxon_id
}{name} =
$taxon
;
$tree_ref
->{
$taxon_id
}{rank} =
$rank
;
}
my
$uniq_name
=
$count_for
{
$taxon
} > 1
?
$taxon
.
' <'
.
lc
$rank
.
'>'
:
q{}
;
$tree_ref
->{
$taxon_id
}{uniq_name} =
$uniq_name
if
$uniq_name
;
if
(
$rank
eq
'species'
) {
if
(
$gca
=~ m/^GCF/xms) {
my
$new_gca
=
$gca
=~
tr
/F/A/r;
push
@{
$tree_ref
->{
$taxon_id
}{gca} },
$new_gca
;
}
push
@{
$tree_ref
->{
$taxon_id
}{gca} },
$gca
;
}
if
(
@taxonomy
) {
$tree_ref
->{
$taxon_id
}{children} //= {};
$tree_ref
=
$tree_ref
->{
$taxon_id
}{children}
}
}
}
my
$name_file
= file(
$tax_dir
,
'names.dmp'
);
my
$node_file
= file(
$tax_dir
,
'nodes.dmp'
);
open
my
$name_out
,
'>'
,
$name_file
;
open
my
$node_out
,
'>'
,
$node_file
;
my
$gcanamefile
= file(
$tax_dir
,
'gca0-names.dmp'
);
my
$gcanodefile
= file(
$tax_dir
,
'gca0-nodes.dmp'
);
open
my
$gcaname_out
,
'>'
,
$gcanamefile
;
open
my
$gcanode_out
,
'>'
,
$gcanodefile
;
file(
$tax_dir
,
'delnodes.dmp'
)->spew;
file(
$tax_dir
,
'merged.dmp'
)->spew;
_write_dmp_files(
$name_out
,
$node_out
,
$gcaname_out
,
$gcanode_out
,
1, 1,
$tree
{1} );
if
( -r
$name_file
&& -r
$node_file
) {
}
if
( -r
$gcanamefile
&& -r
$gcanodefile
) {
}
return
;
}
sub
_read_gtdb_metadata {
my
$infile
=
shift
;
open
my
$in
,
'<'
,
$infile
;
my
%table_for
;
my
@keys
;
LINE:
while
(
my
$line
= <
$in
>) {
chomp
$line
;
if
(
$line
=~ m/^accession/xms) {
(
undef
,
@keys
) =
split
/\t/xms,
$line
;
next
LINE;
}
my
(
$gca
,
@values
) =
split
/\t/xms,
$line
;
$gca
=~ s/GB_|RS_//xms;
$table_for
{
$gca
} = { mesh
@keys
,
@values
};
}
return
\
%table_for
;
}
sub
_write_dmp_files {
my
(
$name_out
,
$node_out
,
$gcaname_out
,
$gcanode_out
,
$taxon_id
,
$parent
,
$tree_ref
) =
@_
;
my
$name
=
$tree_ref
->{name };
my
$rank
=
$tree_ref
->{rank };
my
$uniq_name
=
$tree_ref
->{uniq_name} //
q{}
;
my
$gcas
=
$tree_ref
->{gca } // [];
if
(
$gcas
) {
_append2dmp(
$gcaname_out
,
$gcanode_out
,
$_
,
$parent
,
$name
,
$rank
,
$uniq_name
)
for
sort
@{
$gcas
};
}
_append2dmp(
$name_out
,
$node_out
,
$taxon_id
,
$parent
,
$name
,
$rank
,
$uniq_name
);
my
$children_for
=
$tree_ref
->{children};
return
unless
$children_for
;
$parent
=
$taxon_id
;
_write_dmp_files(
$name_out
,
$node_out
,
$gcaname_out
,
$gcanode_out
,
$_
,
$parent
,
$children_for
->{
$_
} )
for
sort
keys
%{
$children_for
};
return
;
}
sub
_append2dmp {
my
(
$name_out
,
$node_out
,
$taxon_id
,
$parent
,
$name
,
$rank
,
$uniq_name
) =
@_
;
say
{
$name_out
}
join
"\t"
,
$taxon_id
,
'|'
,
$name
,
'|'
,
$uniq_name
,
'|'
,
'scientific name'
;
say
{
$node_out
}
join
"\t"
,
$taxon_id
,
'|'
,
$parent
,
'|'
,
$rank
,
(
join
"\t"
,
'|'
,
q{}
) x 10;
return
;
}
no
Moose::Util::TypeConstraints;
__PACKAGE__->meta->make_immutable;
1;