#!/usr/bin/perl
use
YAML
qw(Dump LoadFile DumpFile)
;
use
vars
qw/$split @filter $zip $outdir $help $ethresh
$ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT
$CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE
$file @files $dir $summary $nolump
$source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION
$gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/
;
use
constant
ALPHABET
=> [
qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)
];
a
=> 0,
b
=> 1,
c
=> 2,
d
=> 3,
e
=> 4,
f
=> 5,
g
=> 6,
h
=> 7,
i
=> 8,
j
=> 9,
k
=> 10,
l
=> 11,
m
=> 12,
n
=> 13,
o
=> 14,
p
=> 15,
q =>
16,
r
=> 17,
s
=> 18,
t
=> 19,
u
=> 20,
v
=> 21,
w
=> 22,
x
=> 23,
y
=> 24,
z
=> 25,
};
$GFF_VERSION
= 3;
$verbose
= 1;
my
$CDSkeep
= 1;
my
$PROTEIN_TYPE
=
'polypeptide'
;
my
$FORMAT
=
"GenBank"
;
my
$SOURCEID
=
$FORMAT
;
my
%TAG_MAP
= (
db_xref
=>
'Dbxref'
,
name
=>
'Name'
,
note
=>
'Note'
,
synonym
=>
'Alias'
,
symbol
=>
'Alias'
,
);
$| = 1;
my
$quiet
= !
$verbose
;
my
$ok
= GetOptions(
'd|dir|input:s'
=> \
$dir
,
'z|zip'
=> \
$zip
,
'h|help'
=> \
$help
,
's|summary'
=> \
$summary
,
'r|noinfer'
=> \
$noinfer
,
'i|conf=s'
=> \
$CONF
,
'sofile=s'
=> \
$SO_FILE
,
'm|manual'
=> \
$MANUAL
,
'o|outdir|output:s'
=> \
$outdir
,
'x|filter:s'
=> \
@filter
,
'y|split'
=> \
$split
,
"ethresh|e=s"
=>\
$ethresh
,
'c|CDS!'
=> \
$CDSkeep
,
'f|format=s'
=> \
$FORMAT
,
'typesource=s'
=> \
$source_type
,
'GFF_VERSION=s'
=> \
$GFF_VERSION
,
'quiet!'
=> \
$quiet
,
'DEBUG!'
=> \
$DEBUG
,
'n|nolump'
=> \
$nolump
);
my
$lump
= 1
unless
$nolump
||
$split
;
$verbose
= !
$quiet
;
pod2usage(2)
if
$help
|| !
$ok
;
$SOURCEID
=
$FORMAT
;
$FORMAT
=
"swiss"
if
$FORMAT
=~/UniProt|trembl/;
$verbose
=1
if
(
$DEBUG
);
my
$unflattener
= Bio::SeqFeature::Tools::Unflattener->new;
$unflattener
->error_threshold(
$ethresh
)
if
$ethresh
;
$unflattener
->verbose(1)
if
(
$DEBUG
);
my
$tm
= Bio::SeqFeature::Tools::TypeMapper->new;
my
$idh
= Bio::SeqFeature::Tools::IDHandler->new;
$source_type
||=
"region"
;
my
$FTSOmap
;
my
$FTSOsynonyms
;
if
(
defined
(
$SO_FILE
) &&
$SO_FILE
eq
'live'
) {
print
"\nDownloading the latest SO file from "
.SO_URL.
"\n\n"
;
my
$ua
= LWP::UserAgent->new(
timeout
=> 30);
my
$request
= HTTP::Request->new(
GET
=> SO_URL);
my
$response
=
$ua
->request(
$request
);
if
(
$response
->status_line =~ /200/) {
my
(
$fh
,
$fn
) = tempfile();
print
$fh
$response
->content;
$SO_FILE
=
$fn
;
}
else
{
print
"Couldn't download SO file online...skipping validation.\n"
.
"HTTP Status was "
.
$response
->status_line .
"\n"
and
undef
$SO_FILE
}
}
if
(
$SO_FILE
) {
my
(
%terms
,
%syn
);
my
$parser
= Bio::OntologyIO->new(
-format
=>
"obo"
,
-file
=>
$SO_FILE
);
$ONTOLOGY
=
$parser
->next_ontology();
for
(
$ONTOLOGY
->get_all_terms) {
my
$feat
=
$_
;
$terms
{
$feat
->name} =
$feat
->name;
my
@syn
=
$_
->each_synonym;
push
@{
$syn
{
$_
}},
$feat
->name
for
@syn
;
}
$FTSOmap
= \
%terms
;
$FTSOsynonyms
= \
%syn
;
my
%hardTerms
= %{
$tm
->FT_SO_map() };
map
{
$FTSOmap
->{
$_
} ||=
$hardTerms
{
$_
} }
keys
%hardTerms
;
}
else
{
my
%terms
= %{
$tm
->FT_SO_map() };
while
(
my
(
$k
,
$v
) =
each
%terms
) {
$FTSOmap
->{
$k
} =
ref
(
$v
) ?
shift
@$v
:
$v
;
}
}
$TYPE_MAP
=
$FTSOmap
;
$SYN_MAP
=
$FTSOsynonyms
;
my
$filter
=
join
' '
,
@filter
if
@filter
;
my
$stdin
=0;
if
(
$dir
&& (
$dir
eq
'-'
||
$dir
eq
'stdin'
)) {
$stdin
=1;
$dir
=
''
;
@files
=(
'stdin'
);
}
elsif
(
$dir
) {
if
( -d
$dir
) {
opendir
DIR,
$dir
or
die
"could not open $dir for reading: $!"
;
@files
=
map
{
"$dir/$_"
;}
grep
{ /\.gb.*/ }
readdir
DIR;
closedir
DIR;
}
else
{
die
"$dir is not a directory\n"
;
}
}
else
{
@files
=
@ARGV
;
$dir
=
''
;
}
pod2usage(2)
unless
@files
;
my
$stdout
=0;
if
(
$outdir
&& (
$outdir
eq
'-'
||
$outdir
eq
'stdout'
)) {
warn
(
"std. output chosen: cannot split\n"
)
if
(
$split
);
warn
(
"std. output chosen: cannot zip\n"
)
if
(
$zip
);
warn
(
"std. output chosen: cannot nolump\n"
)
if
(
$nolump
);
$stdout
=1;
$lump
=1;
$split
= 0;
$zip
= 0;
}
elsif
(
$outdir
&& !-e
$outdir
) {
mkdir
(
$outdir
) or
die
"could not create directory $outdir: $!\n"
;
}
elsif
( !
$outdir
) {
$outdir
=
$dir
||
'.'
;
}
for
my
$file
(
@files
) {
chomp
$file
;
die
"$! $file"
unless
(
$stdin
|| -e
$file
);
print
"# Input: $file\n"
if
(
$verbose
);
my
(
$lump_fh
,
$lumpfa_fh
,
$outfile
,
$outfa
);
if
(
$stdout
) {
$lump_fh
=
*STDOUT
;
$lump
=
"stdout$$"
;
$outfa
=
"stdout$$.fa"
;
open
$lumpfa_fh
,
">$outfa"
or
die
"Could not create a lump outfile called ($outfa) because ($!)\n"
;
}
elsif
(
$lump
) {
my
(
$vol
,
$dirs
,
$fileonly
) = File::Spec->splitpath(
$file
);
$lump
= File::Spec->catfile(
$outdir
,
$fileonly
.
'.gff'
);
(
$outfa
=
$lump
) =~ s/\.gff/\.fa/;
open
$lump_fh
,
">$lump"
or
die
"Could not create a lump outfile called ($lump) because ($!)\n"
;
open
$lumpfa_fh
,
">$outfa"
or
die
"Could not create a lump outfile called ($outfa) because ($!)\n"
;
}
if
(
$stdin
) {
*FH
=
*STDIN
;
}
elsif
(
$file
=~ /\.gz/ ) {
open
FH,
"gunzip -c $file |"
;
}
else
{
open
FH,
'<'
,
$file
;
}
my
$in
= Bio::SeqIO->new(
-fh
=> \
*FH
,
-format
=>
$FORMAT
,
-debug
=>
$DEBUG
);
my
$gffio
= Bio::Tools::GFF->new(
-noparse
=> 1,
-gff_version
=>
$GFF_VERSION
);
while
(
my
$seq
=
$in
->next_seq() ) {
my
$seq_name
=
$seq
->accession_number;
my
$end
=
$seq
->
length
;
my
@to_print
;
$outfile
=
$lump
|| File::Spec->catfile(
$outdir
,
$seq_name
.
'.gff'
);
my
$out
;
if
(
$lump
) {
$outfile
=
$lump
;
$out
=
$lump_fh
;
}
else
{
$outfile
= File::Spec->catfile(
$outdir
,
$seq_name
.
'.gff'
);
open
$out
,
">$outfile"
;
}
my
$source_feat
=
undef
;
my
@source
= filter(
$seq
);
$source_feat
=
$source
[0];
(
$source_type
,
$source_feat
)=
getSourceInfo(
$seq
,
$source_type
,
$source_feat
) ;
warn
"$seq_name has no features, skipping\n"
and
next
if
!
$seq
->all_SeqFeatures;
$FTSOmap
->{
'source'
} =
$source_type
;
my
(
$header
,
$info
)= gff_header(
$seq_name
,
$end
,
$source_type
,
$source_feat
);
print
$out
$header
;
print
"# working on $info\n"
if
(
$verbose
);
unflatten_seq(
$seq
);
@GFF_LINE_FEAT
= ();
for
my
$feature
( get_all_SeqFeatures(
$seq
) ) {
my
$method
=
$feature
->primary_tag;
next
if
(
$SOURCEID
=~/UniProt|swiss|trembl/i &&
$method
ne
$source_type
);
$feature
->seq_id(
$seq
->id)
unless
(
$feature
->seq_id);
$feature
->source_tag(
$SOURCEID
);
maptags2gff(
$feature
);
my
$gene_name
;
if
(
$method
eq
'gene'
||
$method
eq
'pseudogene'
) {
@to_print
= print_held(
$out
,
$gffio
, \
@to_print
);
$gene_id
=
$gene_name
= gene_name(
$feature
);
}
else
{
$gene_name
= gene_name(
$feature
);
}
convert_to_name(
$feature
);
warn
"#at: $method $gene_id/$gene_name\n"
if
$DEBUG
;
if
(
$method
=~ /(gene|RNA|CDS|exon|UTR|protein|polypeptide|transcript)/
|| (
$gene_id
&&
$gene_name
eq
$gene_id
) ) {
my
$action
= gene_features(
$feature
,
$gene_id
,
$gene_name
);
if
(
$action
== GM_DUP_PART) {
}
elsif
(
$action
== GM_NOT_PART) {
add_generic_id(
$feature
,
$gene_name
,
"nocount"
);
my
$gff
=
$gffio
->gff_string(
$feature
);
push
@GFF_LINE_FEAT
,
$feature
;
}
elsif
(
$action
> 0) {
@to_print
= print_held(
$out
,
$gffio
, \
@to_print
)
if
(
$action
== GM_NEW_TOPLEVEL);
push
(
@to_print
,
$feature
);
}
}
else
{
add_generic_id(
$feature
,
$gene_name
,
""
);
my
$gff
=
$gffio
->gff_string(
$feature
);
push
@GFF_LINE_FEAT
,
$feature
;
}
}
@to_print
= print_held(
$out
,
$gffio
, \
@to_print
);
gff_validate(
@GFF_LINE_FEAT
);
for
my
$feature
(
@GFF_LINE_FEAT
) {
my
$gff
=
$gffio
->gff_string(
$feature
);
print
$out
"$gff\n"
if
$gff
;
}
my
(
$fa_out
,
$fa_outfile
);
my
$dna
=
$seq
->seq;
if
(
$dna
||
%proteinfa
) {
$method
{
'RESIDUES'
} +=
length
(
$dna
);
$dna
=~ s/(\S{60})/$1\n/g;
$dna
.=
"\n"
;
if
(
$split
) {
$fa_outfile
=
$outfile
;
$fa_outfile
=~ s/gff$/fa/;
open
$fa_out
,
">$fa_outfile"
or
die
$!;
print
$fa_out
">$seq_name\n$dna"
if
$dna
;
foreach
my
$aid
(
sort
keys
%proteinfa
) {
my
$aa
=
delete
$proteinfa
{
$aid
};
$method
{
'RESIDUES(tr)'
} +=
length
(
$aa
);
$aa
=~ s/(\S{60})/$1\n/g;
print
$fa_out
">$aid\n$aa\n"
;
}
}
else
{
print
$lumpfa_fh
">$seq_name\n$dna"
if
$dna
;
foreach
my
$aid
(
sort
keys
%proteinfa
) {
my
$aa
=
delete
$proteinfa
{
$aid
};
$method
{
'RESIDUES(tr)'
} +=
length
(
$aa
);
$aa
=~ s/(\S{60})/$1\n/g;
print
$lumpfa_fh
">$aid\n$aa\n"
;
}
}
%proteinfa
=();
}
if
(
$zip
&& !
$lump
) {
system
"gzip -f $outfile"
;
system
"gzip -f $fa_outfile"
if
(
$fa_outfile
);
$outfile
.=
'.gz'
;
$fa_outfile
.=
'.gz'
if
$split
;
}
print
"# GFF3 saved to $outfile"
unless
( !
$verbose
||
$stdout
||
$lump
);
print
(
$split
?
"; DNA saved to $fa_outfile\n"
:
"\n"
)
unless
(
$stdout
||
$lump
);
}
print
"# GFF3 saved to $outfile\n"
if
(
$verbose
&&
$lump
);
if
(
$summary
) {
print
"# Summary:\n# Feature\tCount\n# -------\t-----\n"
;
for
(
keys
%method
) {
print
"# $_ $method{$_}\n"
;
}
print
"# \n"
;
}
close
(
$lumpfa_fh
)
if
$lumpfa_fh
;
if
(!
$split
&&
$outfa
&&
$lump_fh
) {
print
$lump_fh
"##FASTA\n"
; # GFF3 spec
open
$lumpfa_fh
,
$outfa
or
warn
"reading FA $outfa: $!"
;
while
( <
$lumpfa_fh
>) {
print
$lump_fh
$_
;
}
close
(
$lumpfa_fh
);
unlink
(
$outfa
);
}
if
(
$zip
&&
$lump
) {
system
"gzip -f $lump"
;
}
close
FH;
}
sub
typeorder {
return
1
if
(
$_
[0] =~ /gene/);
return
2
if
(
$_
[0] =~ /RNA|transcript/);
return
3
if
(
$_
[0] =~ /protein|peptide/);
return
4
if
(
$_
[0] =~ /exon|CDS/);
return
3;
}
sub
sort_by_feattype {
my
(
$at
,
$bt
)= (
$a
->primary_tag,
$b
->primary_tag);
return
(typeorder(
$at
) <=> typeorder(
$bt
))
|| (
$at
cmp
$bt
);
}
sub
print_held {
my
(
$out
,
$gffio
,
$to_print
)=
@_
;
return
unless
(
@$to_print
);
@$to_print
=
sort
sort_by_feattype
@$to_print
;
while
(
my
$feature
=
shift
@$to_print
) {
my
$gff
=
$gffio
->gff_string(
$feature
);
push
@GFF_LINE_FEAT
,
$feature
;
}
return
();
}
sub
maptags2gff {
my
$f
=
shift
;
foreach
my
$tag
(
keys
%TAG_MAP
) {
if
(
$f
->has_tag(
$tag
)) {
my
$newtag
=
$TAG_MAP
{
$tag
};
my
@v
=
$f
->get_tag_values(
$tag
);
$f
->remove_tag(
$tag
);
$f
->add_tag_value(
$newtag
,
@v
);
if
(
$tag
eq
'note'
) {
map
{ s/\[goid (\d+)/\[goid GO:$1/g; }
@v
;
my
@go
=
map
{ m/(GO:\d+)/g }
@v
;
$f
->add_tag_value(
'Ontology_term'
,
@go
)
if
(
@go
);
}
}
}
}
sub
getSourceInfo {
my
(
$seq
,
$source_type
,
$sf
) =
@_
;
my
$is_swiss
= (
$SOURCEID
=~/UniProt|swiss|trembl/i);
my
$is_gene
= (
$SOURCEID
=~/entrezgene/i);
my
$is_rich
= (
ref
(
$seq
) =~ /RichSeq/);
my
$seq_name
=
$seq
->accession_number();
unless
(
$sf
) {
$source_type
=
$is_swiss
?
$PROTEIN_TYPE
:
$is_gene
?
"eneg"
:
$is_rich
?
$seq
->molecule :
$source_type
;
$sf
= Bio::SeqFeature::Generic->direct_new();
my
$len
=
$seq
->
length
();
$len
=1
if
(
$len
<1);
my
$start
= 1;
my
$loc
=
$seq
->can(
'location'
) ?
$seq
->location()
: new Bio::Location::Simple(
-start
=>
$start
,
-end
=>
$len
);
$sf
->location(
$loc
);
$sf
->primary_tag(
$source_type
);
$sf
->source_tag(
$SOURCEID
);
$sf
->seq_id(
$seq_name
);
$sf
->add_tag_value(
Alias
=>
$seq
->id());
$seq
->add_SeqFeature(
$sf
);
}
if
(
$sf
->has_tag(
"chromosome"
)) {
$source_type
=
"chromosome"
;
my
(
$chrname
) =
$sf
->get_tag_values(
"chromosome"
);
$sf
->add_tag_value(
Alias
=>
$chrname
);
}
warn
"# $SOURCEID:$seq_name fields = "
,
join
(
","
,
$seq
->annotation->get_all_annotation_keys()),
"\n"
if
$DEBUG
;
my
%AnnotTagMap
= (
'gene_name'
=>
'Alias'
,
'ALIAS_SYMBOL'
=>
'Alias'
,
'LOCUS_SYNONYM'
=>
'Alias'
,
'symbol'
=>
'Alias'
,
'synonym'
=>
'Alias'
,
'dblink'
=>
'Dbxref'
,
'product'
=>
'product'
,
'Reference'
=>
'reference'
,
'OntologyTerm'
=>
'Ontology_term'
,
'comment'
=>
'Note'
,
'comment1'
=>
'Note'
,
);
my
(
$desc
)=
$seq
->annotation->get_Annotations(
"desc"
) || (
$seq
->desc() );
my
(
$date
)=
$seq
->annotation->get_Annotations(
"dates"
)
||
$seq
->annotation->get_Annotations(
"update-date"
)
||
$is_rich
?
$seq
->get_dates() : ();
my
(
$comment
)=
$seq
->annotation->get_Annotations(
"comment"
);
my
(
$species
)=
$seq
->annotation->get_Annotations(
"species"
);
if
(!
$species
&&
$seq
->can(
'species'
)
&&
defined
$seq
->species()
&&
$seq
->species()->can(
'binomial'
) ) {
$species
=
$seq
->species()->binomial();
}
$sf
->add_tag_value(
ID
=>
$seq_name
)
unless
$sf
->has_tag(
'ID'
);
$sf
->add_tag_value(
Note
=>
$desc
)
if
(
$desc
&& !
$sf
->has_tag(
'Note'
));
$sf
->add_tag_value(
organism
=>
$species
)
if
(
$species
&& !
$sf
->has_tag(
'organism'
));
$sf
->add_tag_value(
comment1
=>
$comment
)
if
(!
$is_swiss
&&
$comment
&& !
$sf
->has_tag(
'comment1'
));
$sf
->add_tag_value(
date
=>
$date
)
if
(
$date
&& !
$sf
->has_tag(
'date'
));
$sf
->add_tag_value(
Dbxref
=>
$SOURCEID
.
':'
.
$seq_name
)
if
$is_swiss
||
$is_gene
;
foreach
my
$atag
(
sort
keys
%AnnotTagMap
) {
my
$gtag
=
$AnnotTagMap
{
$atag
};
next
unless
(
$gtag
);
my
@anno
=
map
{
if
(
ref
$_
&&
$_
->can(
'get_all_values'
)) {
split
( /[,;] */,
join
";"
,
$_
->get_all_values)
}
elsif
(
ref
$_
&&
$_
->can(
'display_text'
)) {
split
( /[,;] */,
$_
->display_text)
}
elsif
(
ref
$_
&&
$_
->can(
'value'
)) {
split
( /[,;] */,
$_
->value)
}
else
{
();
}
}
$seq
->annotation->get_Annotations(
$atag
);
foreach
(
@anno
) {
$sf
->add_tag_value(
$gtag
=>
$_
); }
}
return
(
wantarray
)? (
$source_type
,
$sf
) :
$source_type
;
}
sub
gene_features {
my
(
$f
,
$gene_id
,
$genelinkID
) =
@_
;
local
$_
=
$f
->primary_tag;
$method
{
$_
}++;
if
( /gene/ ) {
$f
->add_tag_value(
ID
=>
$gene_id
)
unless
(
$f
->has_tag(
'ID'
));
$tnum
=
$rnum
= 0;
$ncrna_id
=
$rna_id
=
''
;
return
GM_NEW_TOPLEVEL;
}
elsif
( /mRNA/ ) {
return
GM_NOT_PART
unless
$gene_id
;
return
GM_NOT_PART
if
(
$genelinkID
&&
$genelinkID
ne
$gene_id
);
(
$rna_id
=
$gene_id
) =~ s/gene/mRNA/;
$rna_id
.=
'.t0'
. ++
$tnum
;
$f
->add_tag_value(
ID
=>
$rna_id
);
$f
->add_tag_value(
Parent
=>
$gene_id
);
}
elsif
( /RNA|transcript/) {
if
(
$gene_id
) {
return
GM_NOT_PART
if
(
$genelinkID
&&
$genelinkID
ne
$gene_id
);
(
$ncrna_id
=
$gene_id
) =~ s/gene/ncRNA/;
$ncrna_id
.=
'.r0'
. ++
$rnum
;
$f
->add_tag_value(
Parent
=>
$gene_id
);
$f
->add_tag_value(
ID
=>
$ncrna_id
);
}
else
{
unless
(
$f
->has_tag(
'ID'
)) {
if
(
$genelinkID
) {
$f
->add_tag_value(
ID
=>
$genelinkID
) ;
}
else
{
$idh
->generate_unique_persistent_id(
$f
);
}
}
(
$ncrna_id
)=
$f
->get_tag_values(
'ID'
);
return
GM_NEW_TOPLEVEL;
}
}
elsif
( /exon/ ) {
return
GM_NOT_PART
unless
(
$rna_id
||
$ncrna_id
);
return
GM_NOT_PART
if
(
$genelinkID
&&
$genelinkID
ne
$gene_id
);
for
my
$expar
(
$rna_id
,
$ncrna_id
) {
next
unless
(
$expar
);
if
(
$exonpar
{
$expar
} and
$f
->has_tag(
'Parent'
) ) {
my
@vals
=
$f
->get_tag_values(
'Parent'
);
next
if
(
grep
{
$expar
eq
$_
}
@vals
);
}
$exonpar
{
$expar
}++;
$f
->add_tag_value(
Parent
=>
$expar
);
}
return
GM_DUP_PART
if
++
$seen
{
$f
} > 1;
}
elsif
( /CDS|protein|polypeptide/ ) {
return
GM_NOT_PART
unless
$rna_id
;
return
GM_NOT_PART
if
(
$genelinkID
&&
$genelinkID
ne
$gene_id
);
(
my
$pro_id
=
$rna_id
) =~ s/\.t/\.p/;
if
( !
$CDSkeep
&& /CDS/) {
$f
->primary_tag(
$PROTEIN_TYPE
);
if
(
$f
->location->isa(
"Bio::Location::SplitLocationI"
)) {
my
(
$b
,
$e
)=(-1,0);
foreach
my
$l
(
$f
->location->each_Location) {
$b
=
$l
->start
if
(
$b
<0 ||
$b
>
$l
->start);
$e
=
$l
->end
if
(
$e
<
$l
->end);
}
$f
->location( Bio::Location::Simple->new(
-start
=>
$b
,
-end
=>
$e
,
-strand
=>
$f
->strand) );
}
$f
->add_tag_value(
Derives_from
=>
$rna_id
);
}
else
{
$f
->add_tag_value(
Parent
=>
$rna_id
);
}
$f
->add_tag_value(
ID
=>
$pro_id
);
move_translation_fasta(
$f
,
$pro_id
);
}
elsif
( /region/ ) {
$f
->primary_tag(
'gene_component_region'
);
$f
->add_tag_value(
Parent
=>
$gene_id
);
}
else
{
return
GM_NOT_PART
unless
$gene_id
;
$f
->add_tag_value(
Parent
=>
$gene_id
);
}
return
GM_NEW_PART;
}
sub
add_generic_id {
my
(
$f
,
$ft_name
,
$flags
) =
@_
;
my
$method
=
$f
->primary_tag;
$method
{
$method
}++
unless
(
$flags
=~ /nocount/);
if
(
$f
->has_tag(
'ID'
)) {
}
elsif
(
$f
->has_tag(
$method
) ) {
my
(
$name
) =
$f
->get_tag_values(
$method
);
$f
->add_tag_value(
ID
=>
"$method:$name"
);
}
elsif
(
$ft_name
) {
$f
->add_tag_value(
ID
=>
$ft_name
);
}
else
{
$idh
->generate_unique_persistent_id(
$f
);
}
move_translation_fasta(
$f
, (
$f
->get_tag_values(
"ID"
))[0] )
if
(
$method
=~ /CDS/);
}
sub
move_translation_fasta {
my
(
$f
,
$ft_id
) =
@_
;
if
(
$ft_id
&&
$f
->has_tag(
'translation'
) ) {
my
(
$aa
) =
$f
->get_tag_values(
"translation"
);
if
(
$aa
&&
$aa
!~ /^
length
/) {
$proteinfa
{
$ft_id
}=
$aa
;
$f
->remove_tag(
"translation"
);
$f
->add_tag_value(
"translation"
,
"length."
.
length
(
$aa
));
}
}
}
sub
gff_header {
my
(
$name
,
$end
,
$source_type
,
$source_feat
) =
@_
;
$source_type
||=
"region"
;
my
$info
=
"$source_type:$name"
;
my
$head
=
"##gff-version $GFF_VERSION\n"
.
"##sequence-region $name 1 $end\n"
.
"# conversion-by bp_genbank2gff3.pl\n"
;
if
(
$source_feat
) {
for
my
$key
(
qw(organism Note date)
) {
my
$value
;
if
(
$source_feat
->has_tag(
$key
)) {
(
$value
) =
$source_feat
->get_tag_values(
$key
);
}
if
(
$value
) {
$head
.=
"# $key $value\n"
;
$info
.=
", $value"
;
}
}
$head
=
""
if
$didheader
;
}
else
{
$head
.=
"$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n"
;
}
$didheader
++;
return
(
wantarray
) ? (
$head
,
$info
) :
$head
;
}
sub
unflatten_seq {
my
$seq
=
shift
;
my
$uh_oh
=
"Possible gene unflattening error with"
.
$seq
->accession_number .
": consult STDERR\n"
;
eval
{
$unflattener
->unflatten_seq(
-seq
=>
$seq
,
-noinfer
=>
$noinfer
,
-use_magic
=> 1 );
};
if
( $@ ) {
warn
$seq
->accession_number .
" Unflattening error:\n"
;
warn
"Details: $@\n"
;
print
"# "
.
$uh_oh
;
}
return
0
if
!
$seq
|| !
$seq
->all_SeqFeatures;
map_types(
$tm
,
-seq
=>
$seq
,
-type_map
=>
$FTSOmap
,
-syn_map
=>
$FTSOsynonyms
,
-undefined
=>
"region"
);
}
sub
filter {
my
$seq
=
shift
;
my
@feats
;
my
@sources
;
if
(
$filter
) {
for
my
$f
(
$seq
->remove_SeqFeatures ) {
my
$m
=
$f
->primary_tag;
push
@sources
,
$f
if
(
$m
eq
'source'
);
push
@feats
,
$f
unless
$filter
=~ /
$m
/i;
}
$seq
->add_SeqFeature(
$_
)
foreach
@feats
;
}
else
{
for
my
$f
(
$seq
->get_SeqFeatures ){
my
$m
=
$f
->primary_tag;
push
@sources
,
$f
if
(
$m
eq
'source'
);
}
}
return
@sources
;
}
sub
get_all_SeqFeatures {
my
$seq
=
shift
;
my
@flatarr
;
foreach
my
$feat
(
$seq
->get_SeqFeatures ){
push
(
@flatarr
,
$feat
);
_add_flattened_SeqFeatures(\
@flatarr
,
$feat
);
}
return
@flatarr
;
}
sub
gene_name {
my
$g
=
shift
;
my
$gene_id
=
''
;
if
(
$g
->has_tag(
'locus_tag'
)) {
(
$gene_id
) =
$g
->get_tag_values(
'locus_tag'
);
}
elsif
(
$g
->has_tag(
'gene'
)) {
(
$gene_id
) =
$g
->get_tag_values(
'gene'
);
}
elsif
(
$g
->has_tag(
'ID'
)) {
(
$gene_id
) =
$g
->get_tag_values(
'ID'
);
}
elsif
(
$g
->has_tag(
'product'
)) {
my
(
$name
)=
$g
->get_tag_values(
'product'
);
(
$gene_id
) =
$name
unless
(
$name
=~ / /);
}
elsif
(
$g
->has_tag(
'transposon'
)) {
my
(
$name
)=
$g
->get_tag_values(
'transposon'
);
(
$gene_id
) =
$name
unless
(
$name
=~ / /);
}
return
$gene_id
;
}
sub
convert_to_name {
my
$g
=
shift
;
my
$gene_id
=
''
;
if
(
$g
->has_tag(
'gene'
)) {
(
$gene_id
) =
$g
->get_tag_values(
'gene'
);
$g
->remove_tag(
'gene'
);
$g
->add_tag_value(
'Name'
,
$gene_id
);
}
elsif
(
$g
->has_tag(
'locus_tag'
)) {
(
$gene_id
) =
$g
->get_tag_values(
'locus_tag'
);
$g
->remove_tag(
'locus_tag'
);
$g
->add_tag_value(
'Name'
,
$gene_id
);
}
elsif
(
$g
->has_tag(
'product'
)) {
my
(
$name
)=
$g
->get_tag_values(
'product'
);
(
$gene_id
) =
$name
unless
(
$name
=~ / /);
$g
->add_tag_value(
'Name'
,
$gene_id
);
}
elsif
(
$g
->has_tag(
'transposon'
)) {
my
(
$name
)=
$g
->get_tag_values(
'transposon'
);
(
$gene_id
) =
$name
unless
(
$name
=~ / /);
$g
->add_tag_value(
'Name'
,
$gene_id
);
}
elsif
(
$g
->has_tag(
'ID'
)) {
my
(
$name
)=
$g
->get_tag_values(
'ID'
);
$g
->add_tag_value(
'Name'
,
$name
);
}
return
$gene_id
;
}
sub
_add_flattened_SeqFeatures {
my
(
$arrayref
,
$feat
) =
@_
;
my
@subs
= ();
if
(
$feat
->isa(
"Bio::FeatureHolderI"
)) {
@subs
=
$feat
->get_SeqFeatures;
}
elsif
(
$feat
->isa(
"Bio::SeqFeatureI"
)) {
@subs
=
$feat
->sub_SeqFeature;
}
else
{
warn
ref
(
$feat
).
" is neither a FeatureHolderI nor a SeqFeatureI. "
.
"Don't know how to flatten."
;
}
for
my
$sub
(
@subs
) {
push
(
@$arrayref
,
$sub
);
_add_flattened_SeqFeatures(
$arrayref
,
$sub
);
}
}
sub
map_types {
my
(
$self
,
@args
) =
@_
;
my
(
$sf
,
$seq
,
$type_map
,
$syn_map
,
$undefmap
) =
$self
->_rearrange([
qw(FEATURE
SEQ
TYPE_MAP
SYN_MAP
UNDEFINED
)
],
@args
);
if
(!
$sf
&& !
$seq
) {
$self
->throw(
"you need to pass in either -feature or -seq"
);
}
my
@sfs
= (
$sf
);
if
(
$seq
) {
$seq
->isa(
"Bio::SeqI"
) ||
$self
->throw(
"$seq NOT A SeqI"
);
@sfs
=
$seq
->get_all_SeqFeatures;
}
$type_map
=
$type_map
||
$self
->typemap;
foreach
my
$feat
(
@sfs
) {
$feat
->isa(
"Bio::SeqFeatureI"
) ||
$self
->throw(
"$feat NOT A SeqFeatureI"
);
$feat
->isa(
"Bio::FeatureHolderI"
) ||
$self
->throw(
"$feat NOT A FeatureHolderI"
);
my
$primary_tag
=
$feat
->primary_tag;
my
$mtype
=
$type_map
->{
$primary_tag
};
if
(
$mtype
) {
if
(
ref
(
$mtype
)) {
if
(
ref
(
$mtype
) eq
'ARRAY'
) {
my
$soID
;
(
$mtype
,
$soID
) =
@$mtype
;
if
(
$soID
&&
ref
(
$ONTOLOGY
)) {
my
(
$term
) =
$ONTOLOGY
->find_terms(
-identifier
=>
$soID
);
$mtype
=
$term
->name
if
$term
;
}
elsif
(!
defined
$soID
&&
ref
(
$ONTOLOGY
)) {
undef
$mtype
;
delete
$type_map
->{
$primary_tag
};
}
elsif
(
$undefmap
&&
$mtype
eq
'undefined'
) {
$mtype
=
$undefmap
;
}
$type_map
->{
$primary_tag
} =
$mtype
if
$mtype
;
}
elsif
(
ref
(
$mtype
) eq
'CODE'
) {
$mtype
=
$mtype
->(
$feat
);
}
else
{
$self
->throw(
'must be scalar or CODE ref'
);
}
}
elsif
(
$undefmap
&&
$mtype
eq
'undefined'
) {
$mtype
=
$undefmap
;
}
$feat
->primary_tag(
$mtype
);
}
if
(
$CONF
) {
conf_read();
my
%perfect_matches
;
while
(
my
(
$p_tag
,
$rules
) =
each
%$YAML
) {
RULE:
for
my
$rule
(
@$rules
) {
for
my
$tags
(
@$rule
) {
while
(
my
(
$tag
,
$values
) =
each
%$tags
) {
for
my
$value
(
@$values
) {
if
(
$feat
->has_tag(
$tag
)) {
for
(
$feat
->get_tag_values(
$tag
)) {
next
RULE
unless
$_
=~ /\Q
$value
\E/;
}
}
elsif
(
$tag
eq
'primary_tag'
) {
next
RULE
unless
$value
eq
$feat
->primary_tag;
}
elsif
(
$tag
eq
'location'
) {
next
RULE
unless
$value
eq
$feat
->start.
'..'
.
$feat
->end;
}
else
{
next
RULE }
}
}
}
$perfect_matches
{
$p_tag
}++;
}
}
if
(
scalar
(
keys
%perfect_matches
) == 1) {
$mtype
=
$_
for
keys
%perfect_matches
;
}
elsif
(
scalar
(
keys
%perfect_matches
) > 1) {
warn
"There are conflicting rules in the config file for the"
.
" following types: "
;
warn
"\t$_\n"
for
keys
%perfect_matches
;
warn
"Until conflict resolution is built into the converter,"
.
" you will have to manually edit the config file to remove the"
.
" conflict. Sorry :(. Skipping user preference for this entry"
;
sleep
(2);
}
}
if
( !
$mtype
&&
$syn_map
) {
if
(
$feat
->has_tag(
'note'
)) {
my
@all_matches
;
my
@note
=
$feat
->each_tag_value(
'note'
);
for
my
$k
(
keys
%$syn_map
) {
if
(
$k
=~ /
"(.+)"
/) {
my
$syn
= $1;
for
my
$note
(
@note
) {
if
(
$syn
eq
$note
) {
my
@map
= @{
$syn_map
->{
$k
}};
my
$best_guess
=
$map
[0];
unshift
@{
$all_matches
[-1]}, [
$best_guess
];
$mtype
=
$MANUAL
? manual_curation(
$feat
,
$best_guess
, \
@all_matches
)
:
$best_guess
;
print
'#'
x 78 .
"\nGuessing the proper SO term for GenBank"
.
" entry:\n\n"
. GenBank_entry(
$feat
) .
"\nis:\t$mtype\n"
.
'#'
x 78 .
"\n\n"
;
}
else
{
SO_fuzzy_match(
$k
,
$primary_tag
,
$note
,
$syn
, \
@all_matches
);
}
}
}
}
for
my
$note
(
@note
) {
for
my
$name
(
values
%$type_map
) {
SO_fuzzy_match(
$name
,
$primary_tag
,
$note
,
$name
, \
@all_matches
);
}
}
if
(
scalar
(
@all_matches
) && !
$mtype
) {
my
$top_matches
= first {
defined
$_
} @{
$all_matches
[-1]};
my
$best_guess
=
$top_matches
->[0];
if
(
$best_guess
=~ /
"(.+)"
/) {
$best_guess
=
$syn_map
->{
$best_guess
}->[0];
}
@RETURN
=
@all_matches
;
$mtype
=
$MANUAL
? manual_curation(
$feat
,
$best_guess
, \
@all_matches
)
:
$best_guess
;
print
'#'
x 78 .
"\nGuessing the proper SO term for GenBank"
.
" entry:\n\n"
. GenBank_entry(
$feat
) .
"\nis:\t$mtype\n"
.
'#'
x 78 .
"\n\n"
;
}
}
$mtype
||=
$undefmap
;
$feat
->primary_tag(
$mtype
);
}
}
}
sub
SO_fuzzy_match {
my
$candidate
=
shift
;
my
$primary_tag
=
shift
;
my
$note
=
shift
;
my
$SO_terms
=
shift
;
my
$best_matches_ref
=
shift
;
my
$modifier
=
shift
;
$modifier
||=
''
;
my
@feat_terms
;
for
(
split
(
" |_"
,
$primary_tag
) ) {
my
@camelCase
= /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
push
@feat_terms
,
@camelCase
;
}
for
(
split
(
" |_"
,
$note
) ) {
(
my
$word
=
$_
) =~ s/[;:.,]//g;
push
@feat_terms
,
$word
;
}
my
@SO_terms
=
split
(
" |_"
,
$SO_terms
);
my
(
$plus
,
$minus
) = (0, 0);
my
%feat_terms
;
my
%SO_terms
;
map
{
$feat_terms
{
$_
} = 1}
@feat_terms
;
map
{
$SO_terms
{
$_
} = 1}
@SO_terms
;
for
my
$st
(
keys
%SO_terms
) {
for
my
$ft
(
keys
%feat_terms
) {
(
$st
=~ m/
$modifier
\Q
$ft
\E/) ?
$plus
++ :
$minus
++;
}
}
push
@{
$$best_matches_ref
[
$plus
][
$minus
]},
$candidate
if
$plus
;
}
sub
manual_curation {
my
(
$feat
,
$default_opt
,
$all_matches
) =
@_
;
my
@all_matches
=
@$all_matches
;
my
(
@unique_SO_terms
,
%seen
);
for
(
reverse
@all_matches
) {
for
(
@$_
) {
for
(
@$_
) {
if
(
$_
=~ /
"(.+)"
/) {
for
(@{
$SYN_MAP
->{
$_
}}) {
push
@unique_SO_terms
,
$_
unless
$seen
{
$_
};
$seen
{
$_
}++;
}
}
else
{
push
@unique_SO_terms
,
$_
unless
$seen
{
$_
};
$seen
{
$_
}++;
}
}
}
}
my
$s
=
scalar
(
@unique_SO_terms
);
my
$choice
= 0;
my
$more
=
"[a]uto : automatic input (selects best guess for remaining entries)\r"
.
"[f]ind : search for other SO terms matching your query (e.g. f gene)\r"
.
"[i]nput : add a specific term\r"
.
"[r]eset : reset to the beginning of matches\r"
.
"[s]kip : skip this entry (selects best guess for this entry)\r"
;
$more
.=
"[n]ext : view the next "
.OPTION_CYCLE.
" terms\r"
.
"[p]rev : view the previous "
.OPTION_CYCLE.
" terms"
if
(
$s
> OPTION_CYCLE);
my
$msg
=
"The converter found $s possible matches for the following GenBank entry: "
;
my
$directions
=
"Type a number to select the SO term that best matches"
.
" the genbank entry, or use any of the following options:\r"
.
'_'
x 76 .
"\r$more"
;
my
@options
=
map
{
my
$term
=
$_
;
my
%term
;
for
([
'name'
,
'name'
], [
'def'
,
'definition'
], [
'synonym'
,
'each_synonym'
]) {
my
(
$label
,
$method
) =
@$_
;
$term
{
$label
} = \@{[
$term
->
$method
]};
}
[++
$choice
,
$_
->name, (
$_
->definition ||
'none'
), \
%term
,
$_
->each_synonym ];
}
map
{
$ONTOLOGY
->find_terms(
-name
=>
$_
) }
@unique_SO_terms
;
my
$option
= options_cycle(0, OPTION_CYCLE,
$msg
,
$feat
,
$directions
,
$default_opt
,
@options
);
if
(
$option
eq
'skip'
) {
return
$default_opt
}
elsif
(
$option
eq
'auto'
) {
$MANUAL
= 0;
return
$default_opt
;
}
else
{
return
$option
}
}
sub
options_cycle {
my
(
$start
,
$stop
,
$msg
,
$feat
,
$directions
,
$best_guess
,
@opt
) =
@_
;
my
$entry
= GenBank_entry(
$feat
,
"\r"
);
my
$total
=
scalar
(
@opt
);
(
$start
,
$stop
) = (0, OPTION_CYCLE)
if
( (
$start
< 0) && (
$stop
> 0) );
(
$start
,
$stop
) = (0, OPTION_CYCLE)
if
( ( (
$stop
-
$start
) < OPTION_CYCLE ) &&
$stop
<
$total
);
(
$start
,
$stop
) = (
$total
- OPTION_CYCLE,
$total
)
if
$start
< 0;
(
$start
,
$stop
) = (0, OPTION_CYCLE)
if
$start
>=
$total
;
$stop
=
$total
if
$stop
>
$total
;
my
$dir_copy
=
$directions
;
my
$msg_copy
=
$msg
;
my
$format
=
"format STDOUT = \n"
.
'-'
x 156 .
"\n"
.
'^'
.
'<'
x 77 .
'| Available Commands:'
.
"\n"
.
'$msg_copy'
.
"\n"
.
'-'
x 156 .
"\n"
.
' '
x 78 .
"|\n"
.
'^'
.
'<'
x 77 .
'| ^'
.
'<'
x 75 .
'~'
.
"\n"
.
'$entry'
.
' '
x 74 .
'$dir_copy,'
.
"\n"
.
(
' '
x 20 .
'^'
.
'<'
x 57 .
'| ^'
.
'<'
x 75 .
'~'
.
"\n"
.
' '
x 20 .
'$entry,'
.
' '
x 53 .
'$dir_copy,'
.
"\n"
) x 1000 .
".\n"
;
{
no
warnings
'redefine'
;
eval
$format
;
}
write
;
print
'-'
x 156 .
"\n"
.
'Showing results '
. (
$stop
? (
$start
+ 1 ) :
$start
) .
" - $stop of possible SO term matches: (best guess is \"$best_guess\")"
.
"\n"
.
'-'
x 156 .
"\n"
;
for
(
my
$i
=
$start
;
$i
<
$stop
;
$i
+=2) {
my
(
$left
,
$right
) =
@opt
[
$i
,
$i
+1];
my
(
$nL
,
$nmL
,
$descL
,
$termL
,
@synL
) =
@$left
;
my
(
$nR
,
$nmR
,
$descR
,
$termR
,
@synR
) =
ref
(
$right
) ?
@$right
: (
undef
,
undef
,
undef
);
my
$format
=
"format STDOUT = \n"
;
$format
.=
' '
x 78 .
"|\n"
.
'@>>: name: ^'
.
'<'
x 64 .
'~'
.
' |'
.
(
ref
(
$right
) ? (
'@>>: name: ^'
.
'<'
x 64 .
'~'
) :
''
) .
"\n"
.
'$nL,'
.
' '
x 7 .
'$nmL,'
.
(
ref
(
$right
) ? (
' '
x 63 .
'$nR,'
.
' '
x 7 .
"\$nmR,"
) :
''
) .
"\n"
.
' '
x 11 .
'^'
.
'<'
x 61 .
'...~'
.
' |'
.
(
ref
(
$right
) ? (
' ^'
.
'<'
x 61 .
'...~'
) :
''
) .
"\n"
.
' '
x 11 .
'$nmL,'
.
(
ref
(
$right
) ? (
' '
x 74 .
'$nmR,'
) :
''
) .
"\n"
.
' def: ^'
.
'<'
x 65 .
' |'
.
(
ref
(
$right
) ? (
' def: ^'
.
'<'
x 64 .
'~'
) :
''
) .
"\n"
.
' '
x 11 .
'$descL,'
.
(
ref
(
$right
) ? (
' '
x 72 .
'$descR,'
) :
''
) .
"\n"
.
(
' ^'
.
'<'
x 65 .
' |'
.
(
ref
(
$right
) ? (
' ^'
.
'<'
x 64 .
'~'
) :
''
) .
"\n"
.
' '
x 11 .
'$descL,'
.
(
ref
(
$right
) ? (
' '
x 72 .
'$descR,'
) :
''
) .
"\n"
) x 5 .
' ^'
.
'<'
x 61 .
'...~ |'
.
(
ref
(
$right
) ? (
' ^'
.
'<'
x 61 .
'...~'
) :
''
) .
"\n"
.
' '
x 11 .
'$descL,'
.
(
ref
(
$right
) ? (
' '
x 72 .
'$descR,'
) :
''
) .
"\n"
.
".\n"
;
{
no
warnings
'redefine'
;
eval
$format
;
}
write
;
}
print
'-'
x 156 .
"\nenter a command:"
;
while
(<STDIN>) {
(
my
$input
=
$_
) =~ s/\s+$//;
if
(
$input
=~ /^\d+$/) {
if
(
$input
&&
defined
$opt
[
$input
-1] ) {
return
$opt
[
$input
-1]->[1]
}
else
{
print
"\nThat number is not an option. Please enter a valid number.\n:"
;
}
}
elsif
(
$input
=~ /^n/i |
$input
=~ /
next
/i ) {
return
options_cycle(
$start
+ OPTION_CYCLE,
$stop
+ OPTION_CYCLE,
$msg
,
$feat
,
$directions
,
$best_guess
,
@opt
)
}
elsif
(
$input
=~ /^p/i |
$input
=~ /prev/i ) {
return
options_cycle(
$start
- OPTION_CYCLE,
$stop
- OPTION_CYCLE,
$msg
,
$feat
,
$directions
,
$best_guess
,
@opt
)
}
elsif
(
$input
=~ /^s/i ||
$input
=~ /skip/i ) {
return
'skip'
}
elsif
(
$input
=~ /^a/i ||
$input
=~ /auto/i ) {
return
'auto'
}
elsif
(
$input
=~ /^r/i ||
$input
=~ /
reset
/i ) {
return
manual_curation(
$feat
,
$best_guess
, \
@RETURN
);
}
elsif
(
$input
=~ /^f/i ||
$input
=~ /find/i ) {
my
(
$query
,
@query_results
);
if
(
$input
=~ /(?:^f|find)\s+?(.*?)$/) {
$query
= $1;
}
else
{
print
"Type your search query\n:"
;
while
(<STDIN>) {
chomp
(
$query
=
$_
);
last
}
}
for
(
keys
(
%$TYPE_MAP
),
keys
(
%$SYN_MAP
)) {
SO_fuzzy_match(
$_
,
$query
,
''
,
$_
, \
@query_results
,
'(?i)'
);
}
return
manual_curation(
$feat
,
$best_guess
, \
@query_results
);
}
elsif
(
$input
=~ /^i/i ||
$input
=~ /input/i ) {
print
"Type the term you want to use\n:"
;
while
(<STDIN>) {
chomp
(
my
$input
=
$_
);
if
(!
$TYPE_MAP
->{
$input
}) {
print
"\"$input\" doesn't appear to be a valid SO term. Are "
.
"you sure you want to use it? (y or n)\n:"
;
while
(<STDIN>) {
chomp
(
my
$choice
=
$_
);
if
(
$choice
eq
'y'
) {
print
"\nWould you like to save your preference for "
.
"future use (so you don't have to redo manual "
.
"curation for this feature everytime you run "
.
"the converter)? (y or n)\n"
;
while
(<STDIN>) {
chomp
(
my
$choice
=
$_
);
if
(
$choice
eq
'y'
) {
curation_save(
$feat
,
$input
);
return
$input
;
}
elsif
(
$choice
eq
'n'
) {
return
$input
}
else
{
print
"\nDidn't recognize that command. Please "
.
"type y or n.\n:"
}
}
}
elsif
(
$choice
eq
'n'
) {
return
options_cycle(
$start
,
$stop
,
$msg
,
$feat
,
$directions
,
$best_guess
,
@opt
)
}
else
{
print
"\nDidn't recognize that command. Please "
.
"type y or n.\n:"
}
}
}
else
{
print
"\nWould you like to save your preference for "
.
"future use (so you don't have to redo manual "
.
"curation for this feature everytime you run "
.
"the converter)? (y or n)\n"
;
while
(<STDIN>) {
chomp
(
my
$choice
=
$_
);
if
(
$choice
eq
'y'
) {
curation_save(
$feat
,
$input
);
return
$input
;
}
elsif
(
$choice
eq
'n'
) {
return
$input
}
else
{
print
"\nDidn't recognize that command. Please "
.
"type y or n.\n:"
}
}
}
}
}
else
{
print
"\nDidn't recognize that command. Please re-enter your choice.\n:"
}
}
}
sub
GenBank_entry {
my
(
$f
,
$delimiter
,
$num
) =
@_
;
$delimiter
||=
"\n"
;
my
$entry
=
(
$num
?
' [1] '
:
' '
x 5) .
$f
->primary_tag
. (
$num
?
' '
x (12 -
length
$f
->primary_tag ) .
' [2] '
:
' '
x (15 -
length
$f
->primary_tag)
)
.
$f
->start.
'..'
.
$f
->end
.
"$delimiter"
;
if
(
$num
) {
words_tag(
$f
, \
$entry
);
}
else
{
for
my
$tag
(
$f
->all_tags) {
for
my
$val
(
$f
->each_tag_value(
$tag
) ) {
$entry
.=
' '
x 20;
$entry
.=
$val
eq
'_no_value'
?
"/$tag$delimiter"
:
"/$tag=\"$val\"$delimiter"
;
}
}
}
return
$entry
;
}
sub
gff_validate {
warn
"Validating GFF file\n"
if
$DEBUG
;
my
@feat
=
@_
;
my
(
%parent2child
,
%all_ids
,
%descendants
,
%reserved
);
for
my
$f
(
@feat
) {
for
my
$aTags
([
'Parent'
, \
%parent2child
], [
'ID'
, \
%all_ids
]) {
map
{
push
@{
$$aTags
[1]->{
$_
}},
$f
}
$f
->get_tag_values(
$$aTags
[0])
if
$f
->has_tag(
$$aTags
[0]);
}
}
if
(
$SO_FILE
) {
while
(
my
(
$parentID
,
$aChildren
) =
each
%parent2child
) {
parent_validate(
$parentID
,
$aChildren
, \
%all_ids
, \
%descendants
, \
%reserved
);
}
}
id_validate(\
%all_ids
, \
%reserved
);
}
sub
parent_validate {
my
(
$parentID
,
$aChildren
,
$hAll
,
$hDescendants
,
$hReserved
) =
@_
;
my
$aParents
=
$hAll
->{
$parentID
};
map
{
my
$child
=
$_
;
$child
->add_tag_value(
validation_error
=>
"feature tried to add Parent tag, but no Parent found with ID $parentID"
);
my
%parents
;
map
{
$parents
{
$_
} = 1 }
$child
->get_tag_values(
'Parent'
);
delete
$parents
{
$parentID
};
my
@parents
=
keys
%parents
;
$child
->remove_tag(
'Parent'
);
unless
(
$child
->has_tag(
'ID'
)) {
my
$id
= gene_name(
$child
);
$child
->add_tag_value(
'ID'
,
$id
);
push
@{
$hAll
->{
$id
}},
$child
}
$child
->add_tag_value(
'Parent'
,
@parents
)
if
@parents
;
}
@$aChildren
and
return
unless
scalar
(
@$aParents
);
my
$par
=
join
(
','
,
map
{
$_
->primary_tag }
@$aParents
);
warn
scalar
(
@$aParents
).
" POSSIBLE PARENT(S): $par"
if
$DEBUG
;
my
%parentsToRemove
;
CHILD:
for
my
$child
(
@$aChildren
) {
my
$childType
=
$child
->primary_tag;
warn
"WORKING ON $childType at "
.
$child
->start.
' to '
.
$child
->end
if
$DEBUG
;
for
(
my
$i
= 0;
$i
<
scalar
(
@$aParents
);
$i
++) {
my
$parent
=
$aParents
->[
$i
];
my
$parentType
=
$parent
->primary_tag;
warn
"CHECKING $childType against $parentType"
if
$DEBUG
;
unless
(
$hDescendants
->{
$parentType
}) {
for
my
$term
(
$ONTOLOGY
->find_terms(
-name
=>
$parentType
) ) {
map
{
$hDescendants
->{
$parentType
}{
$_
->name}++
}
$ONTOLOGY
->get_descendant_terms(
$term
);
}
if
(
$parentType
eq
'mRNA'
) {
$hDescendants
->{
$parentType
}{
'exon'
} = 1;
$hDescendants
->{
$parentType
}{
'CDS'
} = 1;
}
}
warn
"\tCAN $childType at "
.
$child
->start .
' to '
.
$child
->end .
" be a child of $parentType?"
if
$DEBUG
;
if
(
$hDescendants
->{
$parentType
}{
$childType
}) {
warn
"\tYES, $childType can be a child of $parentType"
if
$DEBUG
;
$hReserved
->{
$parentID
}{
$parent
}{
'parent'
} =
$parent
;
push
@{
$hReserved
->{
$parentID
}{
$parent
}{
'children'
}},
$child
;
$parentsToRemove
{
$i
}++;
next
CHILD;
}
}
if
(
$child
->has_tag(
'Parent'
) ) {
warn
"\tNO, @{[$child->primary_tag]} cannot be a child of $parentID"
if
$DEBUG
;
my
%parents
;
map
{
$parents
{
$_
} = 1 }
$child
->get_tag_values(
'Parent'
);
delete
$parents
{
$parentID
};
my
@parents
=
keys
%parents
;
warn
'VALIDATION ERROR '
.
$child
->primary_tag.
" at "
.
$child
->start .
' to '
.
$child
->end .
" cannot be a child of ID $parentID"
if
$DEBUG
;
$child
->add_tag_value(
validation_error
=>
"feature cannot be a child of $parentID"
);
$child
->remove_tag(
'Parent'
);
unless
(
$child
->has_tag(
'ID'
)) {
my
$id
= gene_name(
$child
);
$child
->add_tag_value(
'ID'
,
$id
);
push
@{
$hAll
->{
$id
}},
$child
}
$child
->add_tag_value(
'Parent'
,
@parents
)
if
@parents
;
}
}
splice
(
@$aParents
,
$_
, 1)
for
keys
%parentsToRemove
;
}
sub
id_validate {
my
(
$hAll
,
$hReserved
) =
@_
;
for
my
$id
(
keys
%$hAll
) {
shift
@{
$hAll
->{
$id
}}
unless
$hReserved
->{
$id
};
for
my
$feat
(@{
$hAll
->{
$id
}}) {
id_uniquify(0,
$id
,
$feat
,
$hAll
);
}
}
for
my
$parentID
(
keys
%$hReserved
) {
my
@keys
=
keys
%{
$hReserved
->{
$parentID
}};
shift
@keys
;
for
my
$k
(
@keys
) {
my
$parent
=
$hReserved
->{
$parentID
}{
$k
}{
'parent'
};
my
$aChildren
=
$hReserved
->{
$parentID
}{
$k
}{
'children'
};
my
$value
= id_uniquify(0,
$parentID
,
$parent
,
$hAll
);
for
my
$child
(
@$aChildren
) {
my
%parents
;
map
{
$parents
{
$_
}++ }
$child
->get_tag_values(
'Parent'
);
$child
->remove_tag(
'Parent'
);
delete
$parents
{
$parentID
};
$parents
{
$value
}++;
my
@parents
=
keys
%parents
;
$child
->add_tag_value(
'Parent'
,
@parents
);
}
}
}
}
sub
id_uniquify {
my
(
$count
,
$value
,
$feat
,
$hAll
) =
@_
;
warn
"UNIQUIFYING $value"
if
$DEBUG
;
if
(!
$count
) {
$feat
->add_tag_value(
Alias
=>
$value
);
$value
.= (
'.'
.
$feat
->primary_tag)
}
elsif
(
$count
== 1) {
$value
.=
".$count"
}
else
{
chop
$value
;
$value
.=
$count
}
$count
++;
warn
"ENDED UP WITH $value"
if
$DEBUG
;
if
(
$hAll
->{
$value
} ) {
warn
"$value IS ALREADY TAKEN"
if
$DEBUG
;
id_uniquify(
$count
,
$value
,
$feat
,
$hAll
);
}
else
{
$feat
->remove_tag(
'ID'
);
$feat
->add_tag_value(
'ID'
,
$value
);
push
@{
$hAll
->{
$value
}},
$value
;
}
$value
;
}
sub
conf_read {
print
"\nCannot read $CONF. Change file permissions and retry, "
.
"or enter another file\n"
and conf_locate()
unless
-r
$CONF
;
print
"\nCannot write $CONF. Change file permissions and retry, "
.
"or enter another file\n"
and conf_locate()
unless
-w
$CONF
;
$YAML
= LoadFile(
$CONF
);
}
sub
conf_create {
my
(
$path
,
$input
) =
@_
;
print
"Cannot write to $path. Change directory permissions and retry "
.
"or enter another save path\n"
and conf_locate()
unless
-w
$path
;
$CONF
=
$input
;
open
(FH,
'>'
,
$CONF
);
close
(FH);
conf_read();
}
sub
conf_write { DumpFile(
$CONF
,
$YAML
) }
sub
conf_locate {
print
"\nEnter the location of a previously saved config, or a new "
.
"path and file name to create a new config (this step is "
.
"necessary to save any preferences)"
;
print
"\n\nenter a command:"
;
while
(<STDIN>) {
chomp
(
my
$input
=
$_
);
my
(
$fn
,
$path
,
$suffix
) = fileparse(
$input
,
qr/\.[^.]*/
);
if
(-e
$input
&& (! -d
$input
)) {
print
"\nReading $input...\n"
;
$CONF
=
$input
;
conf_read();
last
;
}
elsif
(! -d
$input
&&
$fn
.
$suffix
) {
print
"Creating $input...\n"
;
conf_create(
$path
,
$input
);
last
;
}
elsif
(-e
$input
&& -d
$input
) {
print
"You only entered a directory. "
.
"Please enter BOTH a directory and filename\n"
;
}
else
{
print
"$input does not appear to be a valid path. Please enter a "
.
"valid directory and filename\n"
;
}
print
"\nenter a command:"
;
}
}
sub
curation_save {
my
(
$feat
,
$input
) =
@_
;
if
(!
$CONF
) {
print
"\n\n"
;
conf_locate();
}
elsif
(! -e
$CONF
) {
print
"\n\nThe config file you have chosen doesn't exist.\n"
;
conf_locate();
}
else
{ conf_read() }
my
$entry
= GenBank_entry(
$feat
,
"\r"
, 1);
my
$msg
=
"Term entered: $input"
;
my
$directions
= "Please
select
any/all tags that provide evidence
for
the term you
have entered. You may enter multiple tags by separating them by commas/dashes
(e.g 1,3,5-7). For tags
with
more than one word value (i.e
'note'
), you have
the option of either selecting the entire note as evidence, or specific
keywords. If a tag
has
multiple keywords, they will be tagged alphabetically
for
selection. To
select
a specific keyword in a tag field, you must enter the
tag number followed by the keyword letter (e.g 3a). Multiple keywords may be
selected by entering
each
letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you
select
, the more specific the GenBank entry will have
to be to match your curation. To match the GenBank entry exactly as it
appears, type every number (start-end), or just type
'all'
. Remember, once the converter saves your
preference, you will
no
longer be prompted to choose a feature type
for
any
matching entries
until
you edit the curation.ini file.";
my
$msg_copy
=
$msg
;
my
$dir_copy
=
$directions
;
my
$format
=
"format STDOUT = \n"
.
'-'
x 156 .
"\n"
.
'^'
.
'<'
x 77 .
'| Directions:'
.
"\n"
.
'$msg_copy'
.
"\n"
.
'-'
x 156 .
"\n"
.
' '
x 78 .
"|\n"
.
'^'
.
'<'
x 77 .
'| ^'
.
'<'
x 75 .
'~'
.
"\n"
.
'$entry'
.
' '
x 74 .
'$dir_copy,'
.
"\n"
.
(
' '
x 15 .
'^'
.
'<'
x 62 .
'| ^'
.
'<'
x 75 .
'~'
.
"\n"
.
' '
x 15 .
'$entry,'
.
' '
x 58 .
'$dir_copy,'
.
"\n"
) x 20 .
".\n"
;
{
no
warnings
'redefine'
;
eval
$format
;
}
write
;
print
'-'
x 156 .
"\nenter a command:"
;
my
@tags
= words_tag(
$feat
);
my
$final
= {};
my
$choices
;
while
(<STDIN>) {
chomp
(
my
$choice
=
$_
);
if
(
scalar
(
keys
%$final
) &&
$choice
=~ /^y/i) {
last
}
elsif
(
scalar
(
keys
%$final
) &&
$choice
=~ /^n/i) { curation_save(
$feat
,
$input
)
}
elsif
(
scalar
(
keys
%$final
)) {
print
"\nInvalid selection. Please try again\n"
;
}
elsif
(
$choice
eq
'all'
) {
$choice
=
''
;
for
(
my
$i
=1;
$i
<
scalar
(
@tags
);
$i
++) {
$choice
.=
"$i,"
;
}
chop
$choice
;
}
my
@selections
;
for
(
split
(/(?<=\w)[^[:alnum:]\-]+(?=\d)/,
$choice
)) {
if
(
$_
=~ /(\d+)(?:\D*)-(\d+)(.*)/) {
for
($1..$2) {
push
@selections
,
$_
}
my
$dangling_alphas
= $3;
alpha_expand(
$dangling_alphas
, \
@selections
);
}
else
{
alpha_expand(
$_
, \
@selections
);
}
}
foreach
my
$numbers
(
@selections
) {
my
@c
=
split
(/(?=[\w])/,
$numbers
);
s/\W+//g
foreach
@c
;
my
$num
;
{
$^W = 0;
$num
= 0 +
shift
@c
;
}
my
$tag
=
$tags
[
$num
];
if
(
ref
$tag
&&
scalar
(
@c
)) {
my
$no_value
;
foreach
(
@c
) {
if
(
defined
$tag
->{
$_
}) {
$choices
.=
"${num}[$_] "
;
my
(
$t
,
$v
) = @{
$tag
->{
$_
}};
push
@{${
$final
->{
$input
}}[0]{
$t
}},
$v
;
}
else
{
$no_value
++ }
}
if
(
$no_value
) {
_selection_add(
$tag
,
$final
,
$input
,\
$choices
,
$num
);
}
$choices
=
substr
(
$choices
, 0, -2);
$choices
.=
', '
;
}
elsif
(
ref
$tag
) {
_selection_add(
$tag
,
$final
,
$input
,\
$choices
,
$num
);
}
}
$choices
=
substr
(
$choices
, 0, -2)
if
$choices
;
if
(
$final
) {
print
"\nYou have chosen the following tags:\n$choices\n"
;
print
"This will be written to the config file as:\n"
;
print
Dump
$final
;
print
"\nIs this correct? (y or n)\n"
;
}
else
{
print
"\nInvalid selection. Please try again\n"
}
}
push
@{
$YAML
->{
$input
}},
$final
->{
$input
};
conf_write();
}
sub
words_tag {
my
(
$feat
,
$entry
) =
@_
;
my
@tags
;
@tags
[1,2] = ({
'all'
=> [
'primary_tag'
,
$feat
->primary_tag]}, {
'all'
=> [
'location'
,
$feat
->start.
'..'
.
$feat
->end]});
my
$i
= 3;
foreach
my
$tag
(
$feat
->all_tags) {
foreach
my
$value
(
$feat
->each_tag_value(
$tag
)) {
my
(
$string
,
$tagged_string
);
my
@words
=
split
(/(?=\w+?)/,
$value
);
my
$pos
= 0;
foreach
my
$word
(
@words
) {
(
my
$sanitized_word
=
$word
) =~ s/\W+?//g;
$string
.=
$word
;
my
$lead
=
int
(
$pos
/ALPHABET_DIVISOR);
my
$lag
=
$pos
% ALPHABET_DIVISOR;
my
$a
=
$lead
? ${(ALPHABET)}[
$lead
-1] :
''
;
$a
.=
$lag
? ${(ALPHABET)}[
$lag
] :
'a'
;
$tagged_string
.=
" ($a) $word"
;
$tags
[
$i
]{
$a
} = [
$tag
,
$sanitized_word
];
$pos
++;
}
$value
=
$tagged_string
if
scalar
(
@words
) > 1;
$$entry
.=
"[$i] /$tag=\"$value\"\r"
;
$tags
[
$i
]{
'all'
} = [
$tag
,
$string
];
}
$i
++;
}
return
@tags
;
}
sub
alpha_expand {
my
(
$dangling_alphas
,
$selections
) =
@_
;
if
(
defined
(
$dangling_alphas
) &&
$dangling_alphas
=~ /(\d)*([[:alpha:]]+)-([[:alpha:]]+)/) {
my
$digit
= $1;
push
@$selections
,
$digit
if
$digit
;
my
$start
= $2;
my
$stop
= $3;
my
@starts
=
split
(
''
,
$start
);
my
@stops
=
split
(
''
,
$stop
);
my
(
$final_start
,
$final_stop
);
for
([\
$final_start
, \
@starts
], [\
$final_stop
, \
@stops
]) {
my
(
$final
,
$splits
) =
@$_
;
my
$int
= ${(ALPHABET_TO_NUMBER)}{
$$splits
[0]};
my
$rem
;
if
(
$$splits
[1]) {
$rem
= ${(ALPHABET_TO_NUMBER)}{
$$splits
[1]};
$int
++
}
else
{
$rem
=
$int
;
$int
= 0;
}
$$final
=
$int
* ALPHABET_DIVISOR;
$$final
+=
$rem
;
}
my
$last_number
=
pop
@$selections
;
for
my
$pos
(
$final_start
..
$final_stop
) {
my
$lead
=
int
(
$pos
/ALPHABET_DIVISOR);
my
$lag
=
$pos
% ALPHABET_DIVISOR;
my
$alpha
=
$lead
? ${(ALPHABET)}[
$lead
-1] :
''
;
$alpha
.=
$lag
? ${(ALPHABET)}[
$lag
] :
'a'
;
push
@$selections
,
$last_number
.
$alpha
;
}
}
elsif
(
defined
(
$dangling_alphas
)) {
if
(
$dangling_alphas
=~ /^\d/) {
push
@$selections
,
$dangling_alphas
;
}
elsif
(
$dangling_alphas
=~ /^\D/) {
my
$last_number
=
pop
@$selections
;
$last_number
||=
''
;
push
@$selections
,
$last_number
.
$dangling_alphas
;
}
}
}
sub
_selection_add {
my
(
$tag
,
$final
,
$input
,
$choices
,
$num
) =
@_
;
my
(
$t
,
$v
) = @{
$tag
->{
'all'
}};
unless
(
defined
${
$final
->{
$input
}}[0]{
$t
}) {
$$choices
.=
"$num, "
;
push
@{${
$final
->{
$input
}}[0]{
$t
}},
$v
}
}