#!/usr/bin/perl
my
%CHROMOSOMES
= (
I
=> 230_203,
II
=> 813_139,
III
=> 316_613,
IV
=> 1_531_929,
V
=> 576_869,
VI
=> 270_148,
VII
=> 1_090_937,
VIII
=> 562_639,
IX
=> 439_885,
X
=> 745_444,
XI
=> 666_445,
XII
=> 1_078_173,
XIII
=> 924_430,
XIV
=> 784_328,
XV
=> 1_091_284,
XVI
=> 948_061,
Mit
=> 85_779);
my
@ROMAN
=
qw(I II III IV V VI VII VIII IX X
XI XII XIII XIV XV XVI Mit)
;
if
(
$ARGV
[0] =~ /^--?h/) {
die
<<USAGE;
Usage: $0 <SGD features file>
This script massages the SGD sequence annotation flat files located at
into a version of the GFF format suitable for display by the generic
genome browser.
To use this script, get the SGD chromosomal_features.tab file from the
FTP site listed above, and run the following command:
% process_sgd.pl chromosomal_features.tab > yeast.gff
The yeast.gff file can then be loaded into a Bio::DB::GFF database
using the following command:
% bulk_load_gff.pl -d <databasename> yeast.gff
USAGE
;
}
for
my
$chrom
(
sort
keys
%CHROMOSOMES
) {
print
join
(
"\t"
,
$chrom
,
'chromosome'
,
'Component'
,1,
$CHROMOSOMES
{
$chrom
},
'.'
,
'.'
,
'.'
,
qq(Sequence "$chrom")
),
"\n"
;
}
while
(<>) {
chomp
;
my
(
$id
,
$gene
,
$aliases
,
$type
,
$chromosome
,
$start
,
$stop
,
$strand
,
$sgdid
,
$sgdid2
,
$description
,
$date
) =
split
"\t"
;
my
$ref
=
$ROMAN
[
$chromosome
-1];
$description
=~ s/
"/\\"
/g;
$description
=~ s/;/\\;/g;
$strand
=
$strand
eq
'W'
?
'+'
:
'-'
;
(
$start
,
$stop
) = (
$stop
,
$start
)
if
$strand
eq
'-'
;
die
"Strand logic is messed up"
if
$stop
<
$start
;
if
(
$gene
) {
my
@aliases
=
split
(/\|/,
$aliases
);
my
$aliases
=
join
" ; "
,
map
{
qq(Alias "$_")
}
@aliases
;
my
$group
=
qq(Gene "$gene" ; Note "$description")
;
$group
.=
" ; $aliases"
if
$aliases
;
print
join
(
"\t"
,
$ref
,
'sgd'
,
'gene'
,
$start
,
$stop
,
'.'
,
$strand
,
'.'
,
$group
),
"\n"
;
$description
.=
"\\; AKA @aliases"
if
@aliases
;
}
print
join
(
"\t"
,
$ref
,
'sgd'
,
$type
,
$start
,
$stop
,
'.'
,
$strand
,
'.'
,
qq($type "$id" ; Note "$description")
),
"\n"
;
}