#!/usr/bin/perl
my
$DEBUG
= 0;
$SEP
=
'_'
;
my
$evalue_filter
= 1e-3;
my
@files
;
my
$zcat
=
'zcat'
;
my
$prefix
= File::Spec->catfile(
$HOME
,
'taxonomy'
);
my
$gi2taxidfile
=
"$prefix/gi_taxid_prot.dmp.gz"
;
my
$force
= 0;
GetOptions(
'v|verbose|debug'
=> \
$DEBUG
,
'force!'
=> \
$force
,
'z|zcat:s'
=> \
$zcat
,
'i|in:s'
=> \
@files
,
'e|evalue:f'
=> \
$evalue_filter
,
't|taxonomy:s'
=> \
$prefix
,
'g|gi|gi2taxid:s'
=> \
$gi2taxidfile
,
'h|help'
=>
sub
{
system
(
'perldoc'
, $0);
exit
},
);
mkdir
(File::Spec->catfile(
$prefix
,
'idx'
))
unless
-d File::Spec->catfile(
$prefix
,
'idx'
);
my
$taxdb
= Bio::DB::Taxonomy->new
(
-source
=>
'flatfile'
,
-directory
=> File::Spec->catfile
(
$prefix
,
'idx'
),
-nodesfile
=> File::Spec->catfile(
$prefix
,
'nodes.dmp'
),
-namesfile
=> File::Spec->catfile(
$prefix
,
'names.dmp'
)
);
my
%query
;
my
(
%taxid4gi
,
%gi2node
);
my
$dbh
=
tie
(
%gi2node
,
'DB_File'
,
'gi2class'
);
my
$giidxfile
= File::Spec->catfile(
$prefix
,
'idx'
,
'gi2taxid'
);
my
$done
= -f
$giidxfile
;
$done
= 0
if
$force
;
my
$dbh2
=
$dbh
= DBI->
connect
(
"dbi:SQLite:dbname=$giidxfile"
,
""
,
""
);
if
( !
$done
) {
$dbh2
->
do
("CREATE TABLE gi2taxid ( gi integer PRIMARY KEY,
taxid integer NOT NULL)");
$dbh2
->{AutoCommit} = 0;
my
$fh
;
if
(not -f
$gi2taxidfile
) {
die
"Error: File $gi2taxidfile does not exist\n"
;
}
if
(
$gi2taxidfile
=~ /\.gz$/ ) {
open
(
$fh
,
"$zcat $gi2taxidfile |"
) ||
die
"$zcat $gi2taxidfile: $!"
;
}
else
{
open
(
$fh
,
$gi2taxidfile
) ||
die
"Error: could not read file $gi2taxidfile: $!"
;
}
my
$i
= 0;
my
$sth
=
$dbh2
->prepare(
"INSERT INTO gi2taxid (gi,taxid) VALUES (?,?)"
);
while
(<
$fh
>) {
my
(
$gi
,
$taxid
) =
split
;
$sth
->execute(
$gi
,
$taxid
);
$i
++;
if
(
$i
% 500000 == 0 ) {
$dbh
->commit;
warn
(
"$i\n"
)
if
$DEBUG
;
}
}
$dbh
->commit;
$sth
->finish;
}
for
my
$file
(
@files
) {
warn
(
"$file\n"
);
my
$gz
;
if
(
$file
=~ /\.gz$/) {
$gz
= 1;
}
my
(
$spname
) =
split
(/\./,
$file
);
my
(
$fh
,
$i
);
if
(
$gz
) {
open
(
$fh
,
"$zcat $file |"
) ||
die
"$zcat $file: $!"
;
}
else
{
open
(
$fh
,
$file
) ||
die
"$file: $!"
;
}
my
$sth
=
$dbh
->prepare(
"SELECT taxid from gi2taxid WHERE gi=?"
);
while
(<
$fh
>) {
next
if
/^\
my
(
$qname
,
$hname
,
$pid
,
$qaln
,
$mismatch
,
$gaps
,
$qstart
,
$qend
,
$hstart
,
$hend
,
$evalue
,
$bits
,
$score
) =
split
(/\t/,
$_
);
next
if
(
$evalue
>
$evalue_filter
);
if
( !
exists
$query
{
$spname
}->{
$qname
} ) {
$query
{
$spname
}->{
$qname
} = {};
}
if
(
$hname
=~ /gi\|(\d+)/) {
my
$gi
= $1;
if
( !
$gi2node
{
$gi
} ){
$sth
->execute(
$gi
);
my
$taxid
;
$sth
->bind_columns(\
$taxid
);
if
( !
$sth
->fetch ) {
warn
(
"no taxid for $gi\n"
);
next
;
}
my
$node
=
$taxdb
->get_Taxonomy_Node(
$taxid
);
if
( !
$node
) {
warn
(
"cannot find node for gi=$gi ($hname) (taxid=$taxid)\n"
);
next
;
}
my
$parent
=
$taxdb
->get_Taxonomy_Node(
$node
->parent_id);
while
(
defined
$parent
&&
$parent
->node_name ne
'root'
) {
if
(
$parent
->rank eq
'kingdom'
) {
(
$gi2node
{
$gi
}) =
$parent
->node_name;
last
;
}
elsif
(
$parent
->rank eq
'superkingdom'
) {
(
$gi2node
{
$gi
}) =
$parent
->node_name;
$gi2node
{
$gi
} =~ s/ \<(bacteria|archaea)\>//g;
last
;
}
$parent
=
$taxdb
->get_Taxonomy_Node(
$parent
->parent_id);
}
}
my
(
$kingdom
) =
$gi2node
{
$gi
};
unless
(
defined
$kingdom
&&
length
(
$kingdom
) ) {
}
else
{
$query
{
$spname
}->{
$qname
}->{
$kingdom
}++;
}
}
else
{
warn
(
"no GI in $hname\n"
);
}
}
last
if
(
$DEBUG
&&
$i
++ > 10000);
$sth
->finish;
}
while
(
my
(
$sp
,
$d
) =
each
%query
) {
my
$total
=
scalar
keys
%$d
;
print
"$sp total=$total\n"
;
my
%seen
;
for
my
$v
(
values
%$d
) {
my
$tag
=
join
(
","
,
sort
keys
%$v
);
$seen
{
$tag
}++;
}
for
my
$t
(
sort
{
$seen
{
$a
} <=>
$seen
{
$b
} }
keys
%seen
) {
printf
" %-20s\t%d\t%.2f%%\n"
,
$t
,
$seen
{
$t
}, 100 *
$seen
{
$t
} /
$total
;
}
print
"\n\n"
;
}