- --pathway-file
-
- This is a tab-delimited file prepared from a pathway database (such as KEGG), with the columns: [path_id, path_name, class, gene_line, diseases, drugs, description] The latter three columns are optional (but are available on KEGG). The gene_line contains the "entrez_id:gene_name" of all genes involved in this pathway, each separated by a "|" symbol.
-
For example, a line in the pathway-file would look like:
hsa00061 Fatty acid biosynthesis Lipid Metabolism 31:ACACA|32:ACACB|27349:MCAT|2194:FASN|54995:OXSM|55301:OLAH
Ensure that the gene names and entrez IDs used match those used in the MAF file. Entrez IDs are not mandatory (use a 0 if Entrez ID unknown). But if a gene name in the MAF does not match any gene name in this file, the entrez IDs are used to find a match (unless it's a 0).
- --gene-covg-dir
- --bam-list
-
- Provide a file containing sample names and normal/tumor BAM locations for each. Use the tab- delimited format [sample_name normal_bam tumor_bam] per line. This tool only needs sample_name, so all other columns can be skipped. The sample_name must be the same as the tumor sample names used in the MAF file (16th column, with the header Tumor_Sample_Barcode).
- --bmr
- --genes-to-ignore
sub _doc_authors { return <<EOS Michael Wendl, Ph.D. EOS }
sub _doc_credits { return <<EOS This module uses reformatted copies of data from the Kyoto Encyclopedia of Genes and Genomes (KEGG) database:
sub execute { my $self = shift; my $covg_dir = $self->gene_covg_dir; my $bam_list = $self->bam_list; my $pathway_file = $self->pathway_file; my $maf_file = $self->maf_file; my $output_file = $self->output_file; my $bgd_mut_rate = $self->bmr; my $genes_to_ignore = $self->genes_to_ignore; my $min_mut_genes_per_path = $self->min_mut_genes_per_path; my $skip_non_coding = $self->skip_non_coding; my $skip_silent = $self->skip_silent;
# Check on all the input data before starting work
STDERR
"MAF file not found or is empty: $maf_file\n"
unless
( -s
$maf_file
);
STDERR
"Directory with gene coverages not found: $covg_dir\n"
unless
( -e
$covg_dir
);
STDERR
"List of samples not found or is empty: $bam_list\n"
unless
( -s
$bam_list
);
STDERR
"Pathway info file not found or is empty: $pathway_file\n"
unless
( -s
$pathway_file
);
return
undef
unless
( -s
$maf_file
&& -e
$covg_dir
&& -s
$bam_list
&& -s
$pathway_file
);
# Build a hash to quickly lookup the genes whose mutations should be ignored
my
%ignored_genes
= ();
if
(
defined
$genes_to_ignore
)
{
%ignored_genes
=
map
{
$_
=> 1 }
split
( /,/,
$genes_to_ignore
);
}
# PathScan uses a helluva lot of hashes - all your RAM are belong to it
my
%sample_gene_hash
;
# sample => array of genes (based on maf)
my
%gene_path_hash
;
# gene => array of pathways (based on path_file)
my
%path_hash
;
# pathway => all the information about the pathways in the database
my
%sample_path_hash
;
# sample => pathways (based on %sample_gene_hash and %gene_path_hash)
my
%path_sample_hits_hash
;
# path => sample => hits,mutated_genes
my
%gene_sample_cov_hash
;
# gene => sample => coverage
my
@all_sample_names
;
# names of all the samples, no matter if it's mutated or not
my
%id_gene_hash
;
# entrez id => gene (based on first two columns in MAF)
# Parse out the names of the samples which should match the names in the MAF file
my
$sampleFh
= IO::File->new(
$bam_list
) or
die
"Couldn't open $bam_list. $!\n"
;
while
(
my
$line
=
$sampleFh
->getline )
{
next
if
(
$line
=~ m/^
#/ );
chomp
(
$line
);
my
(
$sample
) =
split
( /\t/,
$line
);
push
(
@all_sample_names
,
$sample
);
}
$sampleFh
->
close
;
# Read coverage data calculated by the Music::Bmr::CalcCovg
$covg_dir
=~ s/(\/)+$//;
# Remove trailing forward slashes if any
read_CoverageFiles(
$covg_dir
, \
@all_sample_names
, \
%gene_sample_cov_hash
);
#build gene => average_coverage hash for population test
my
%gene_cov_hash
;
foreach
my
$gene
(
keys
%gene_sample_cov_hash
)
{
my
$total_cov
= 0;
my
$sample_num
=
scalar
(
@all_sample_names
);
$total_cov
+=
$gene_sample_cov_hash
{
$gene
}{
$_
}
foreach
(
@all_sample_names
);
$gene_cov_hash
{
$gene
} =
int
(
$total_cov
/
$sample_num
);
}
#build %sample_gene_hash based on maf
my
$maf_fh
= IO::File->new(
$maf_file
);
while
(
my
$line
=
$maf_fh
->getline )
{
next
if
(
$line
=~ m/^(
#|Hugo_Symbol)/ );
chomp
(
$line
);
my
@cols
=
split
( /\t/,
$line
);
my
(
$gene
,
$entrez_id
,
$mutation_class
,
$tumor_sample
) = (
$cols
[0],
$cols
[1],
$cols
[8],
$cols
[15] );
# If the mutation classification is odd, quit with error
if
(
$mutation_class
!~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|Silent|In_Frame_Ins|Intron|RNA|3
'Flank|3'
UTR|5
'Flank|5'
UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ )
{
STDERR
"Unrecognized Variant_Classification $mutation_class in MAF file.\n"
;
STDERR
"Please use TCGA MAF Specification v2.3.\n"
;
return
undef
;
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if
((
$skip_non_coding
&&
$mutation_class
=~ m/^(Intron|RNA|3
'Flank|3'
UTR|5
'Flank|5'
UTR|IGR|Targeted_Region)$/ ) ||
(
$skip_silent
&&
$mutation_class
=~ m/^Silent$/ ))
{
STDERR
"Skipping $mutation_class mutation in gene $gene.\n"
;
next
;
}
# Check that the user followed instructions and named each sample correctly
unless
(
grep
( /^
$tumor_sample
$/,
@all_sample_names
))
{
STDERR
"Sample $tumor_sample in MAF file does not match any in $bam_list\n"
;
return
undef
;
}
next
if
(
defined
$ignored_genes
{
$gene
} );
# Ignore variants in genes that user wants ignored
$id_gene_hash
{
$entrez_id
} =
$gene
unless
(
$entrez_id
eq
''
or
$entrez_id
== 0 or
$entrez_id
!~ m/^\d+$/ );
push
( @{
$sample_gene_hash
{
$tumor_sample
}},
$gene
)
unless
(
grep
/^
$gene
$/, @{
$sample_gene_hash
{
$tumor_sample
}} );
}
$maf_fh
->
close
;
my
$path_fh
= IO::File->new(
$pathway_file
);
while
(
my
$line
=
$path_fh
->getline )
{
chomp
(
$line
);
next
if
(
$line
=~ /^(
#|ID)/ ); #Skip headers
my
(
$path_id
,
$name
,
$class
,
$gene_line
,
$diseases
,
$drugs
,
$description
) =
split
( /\t/,
$line
);
my
@genes
=
split
( /\|/,
$gene_line
);
#Each gene is in the format "EntrezID:GeneSymbol"
$diseases
=~ s/\|/, /g;
#Change the separators to commas
$drugs
=~ s/\|/, /g;
#Change the separators to commas
$path_hash
{
$path_id
}{name} =
$name
unless
(
$name
eq
''
);
$path_hash
{
$path_id
}{class} =
$class
unless
(
$class
eq
''
);
$path_hash
{
$path_id
}{diseases} =
$diseases
unless
(
$diseases
eq
''
);
$path_hash
{
$path_id
}{drugs} =
$drugs
unless
(
$drugs
eq
''
);
$path_hash
{
$path_id
}{description} =
$description
unless
(
$description
eq
''
);
@{
$path_hash
{
$path_id
}{gene}} = ();
foreach
my
$gene
(
@genes
)
{
my
(
$entrez_id
,
$gene_symbol
) =
split
( /:/,
$gene
);
unless
(
$entrez_id
eq
''
or
$entrez_id
== 0 or
$entrez_id
!~ m/^\d+$/ )
{
# Use the gene name from the MAF file if the entrez ID matches
$gene_symbol
=
$id_gene_hash
{
$entrez_id
}
if
(
defined
$id_gene_hash
{
$entrez_id
} );
}
push
( @{
$gene_path_hash
{
$gene_symbol
}},
$path_id
)
unless
(
grep
/^
$path_id
$/, @{
$gene_path_hash
{
$gene_symbol
}} );
unless
(
grep
/^
$gene_symbol
$/, @{
$path_hash
{
$path_id
}{gene}} )
{
push
( @{
$path_hash
{
$path_id
}{gene}},
$gene_symbol
);
}
}
}
$path_fh
->
close
;
#build a sample => pathway hash
foreach
my
$sample
(
keys
%sample_gene_hash
)
{
foreach
my
$gene
( @{
$sample_gene_hash
{
$sample
}} )
{
if
(
defined
$gene_path_hash
{
$gene
} )
{
foreach
my
$pathway
( @{
$gene_path_hash
{
$gene
}} )
{
push
( @{
$sample_path_hash
{
$sample
}},
$pathway
)
unless
(
grep
/^
$pathway
$/, @{
$sample_path_hash
{
$sample
}} );
}
}
}
}
#build path_sample_hits_hash, for population test
foreach
my
$sample
(
keys
%sample_path_hash
)
{
foreach
my
$path
( @{
$sample_path_hash
{
$sample
}} )
{
my
$hits
= 0;
my
@mutated_genes
= ();
#Mutated genes in this sample belonging to this pathway
my
@mutated_genes_in_sample
= @{
$sample_gene_hash
{
$sample
}};
foreach
my
$gene
( @{
$path_hash
{
$path
}{gene}} )
{
if
(
grep
/^
$gene
$/,
@mutated_genes_in_sample
)
#if this gene is mutated in this sample (in maf)
{
$hits
++;
push
(
@mutated_genes
,
$gene
);
}
}
if
(
$hits
> 0 )
{
$path_sample_hits_hash
{
$path
}{
$sample
}{hits} =
$hits
;
$path_sample_hits_hash
{
$path
}{
$sample
}{mutated_genes} = \
@mutated_genes
;
}
}
}
#Calculation of p value
my
%data
;
#For printing
my
@pvals
;
foreach
my
$path
(
sort
keys
%path_hash
)
{
my
@pathway_genes
= @{
$path_hash
{
$path
}{gene}};
my
@gene_sizes
= ();
foreach
my
$gene
(
@pathway_genes
)
{
if
(
defined
$gene_cov_hash
{
$gene
} )
{
my
$avg_cov
=
int
(
$gene_cov_hash
{
$gene
} );
push
(
@gene_sizes
,
$avg_cov
)
if
(
$avg_cov
> 3 );
}
}
#If this pathway doesn't have any gene coverage, skip it
next
unless
(
scalar
(
@gene_sizes
) > 0 );
my
@num_hits_per_sample
;
#store hits info for each patient
my
@mutated_samples
=
sort
keys
%{
$path_sample_hits_hash
{
$path
}};
foreach
my
$sample
(
@all_sample_names
)
{
my
$hits
= 0;
#if this sample has mutation
if
(
grep
/^
$sample
$/,
@mutated_samples
)
{
$hits
=
$path_sample_hits_hash
{
$path
}{
$sample
}{hits};
}
push
(
@num_hits_per_sample
,
$hits
);
}
#If this pathway doesn't have any mutated genes in any samples, skip it
next
unless
(
scalar
(
@num_hits_per_sample
) > 0 );
my
$hits_ref
= \
@num_hits_per_sample
;
########### MCW ADDED
# FIND MAX NUMBER OF HITS IN A SAMPLE
my
$max_hits
= 0;
foreach
my
$hits_in_sample
(
@num_hits_per_sample
)
{
$max_hits
=
$hits_in_sample
if
(
$hits_in_sample
>
$max_hits
);
}
########### MCW ADDED
my
$pop_obj
= Genome::Model::Tools::Music::PathScan::PopulationPathScan->new( \
@gene_sizes
);
if
(
scalar
(
@gene_sizes
) >= 3 )
{
########### MCW ADDED
if
(
$max_hits
> 15 )
{
$pop_obj
->assign( 5 );
}
else
{
$pop_obj
->assign( 3 );
}
########### MCW ADDED
#$pop_obj->assign(3);
}
elsif
(
@gene_sizes
== 2 )
{
$pop_obj
->assign( 2 );
}
else
{
$pop_obj
->assign( 1 );
}
$pop_obj
->preprocess(
$bgd_mut_rate
,
$hits_ref
);
#mwendl's new fix
my
$pval
=
$pop_obj
->population_pval_approx(
$hits_ref
);
$data
{
$pval
}{
$path
}{samples} = \
@mutated_samples
;
$data
{
$pval
}{
$path
}{hits} =
$hits_ref
;
push
(
@pvals
,
$pval
);
# For calculation of FDR
}
# Calculate False Discovery Rates (Benjamini-Hochberg FDR) for the p-values
my
$pval_cnt
=
scalar
(
@pvals
);
my
%fdr_hash
;
for
(
my
$i
= 0;
$i
<
$pval_cnt
;
$i
++ )
{
my
$fdr
=
$pvals
[
$i
] *
$pval_cnt
/ (
$pval_cnt
-
$i
);
$fdr
= 1
if
$fdr
> 1;
$fdr_hash
{
$pvals
[
$i
]} =
$fdr
;
}
# Print two output files, one more detailed than the other
my
$out_fh
= IO::File->new(
$output_file
,
">"
);
my
$out_detailed_fh
= IO::File->new(
"$output_file\_detailed"
,
">"
);
$out_fh
->
(
"Pathway\tName\tClass\tSamples_Affected\tTotal_Variations\tp-value\tFDR\n"
);
foreach
my
$pval
(
sort
{
$a
<=>
$b
}
keys
%data
)
{
foreach
my
$path
(
sort
keys
%{
$data
{
$pval
}} )
{
# Skip this pathway if it has fewer affected genes than the user wants
my
%mutated_gene_hash
;
my
@samples
= @{
$data
{
$pval
}{
$path
}{samples}};
foreach
my
$sample
(
@samples
)
{
foreach
my
$gene
( @{
$path_sample_hits_hash
{
$path
}{
$sample
}{mutated_genes}} )
{
$mutated_gene_hash
{
$gene
}++;
}
}
next
unless
(
scalar
(
keys
%mutated_gene_hash
) >=
$min_mut_genes_per_path
);
# Print detailed output to a separate output file
$out_detailed_fh
->
(
"Pathway: $path\n"
);
$out_detailed_fh
->
(
"Name: "
,
$path_hash
{
$path
}{name},
"\n"
)
if
(
defined
$path_hash
{
$path
}{name} );
$out_detailed_fh
->
(
"Class: "
,
$path_hash
{
$path
}{class},
"\n"
)
if
(
defined
$path_hash
{
$path
}{class} );
$out_detailed_fh
->
(
"Diseases: "
,
$path_hash
{
$path
}{diseases},
"\n"
)
if
(
defined
$path_hash
{
$path
}{diseases} );
$out_detailed_fh
->
(
"Drugs: "
,
$path_hash
{
$path
}{drugs},
"\n"
)
if
(
defined
$path_hash
{
$path
}{drugs} );
$out_detailed_fh
->
(
"P-value: $pval\n"
,
"FDR: "
,
$fdr_hash
{
$pval
},
"\n"
);
$out_detailed_fh
->
(
"Description: "
,
$path_hash
{
$path
}{description},
"\n"
);
my
@hits
= @{
$data
{
$pval
}{
$path
}{hits}};
foreach
my
$sample
(
@samples
)
{
my
@mutated_genes
= @{
$path_sample_hits_hash
{
$path
}{
$sample
}{mutated_genes}};
$out_detailed_fh
->
(
"$sample:"
);
$out_detailed_fh
->
(
join
","
,
@mutated_genes
);
$out_detailed_fh
->
(
"\n"
);
}
my
(
$mutSampleCnt
,
$totalMutGenes
) = ( 0, 0 );
$out_detailed_fh
->
(
"Samples with mutations (#hits): "
);
for
(
my
$i
= 0;
$i
<
scalar
(
@all_sample_names
); ++
$i
)
{
if
(
$hits
[
$i
] > 0 )
{
$out_detailed_fh
->
(
"$all_sample_names[$i]($hits[$i]) "
);
$mutSampleCnt
++;
$totalMutGenes
+=
$hits
[
$i
];
}
}
$out_detailed_fh
->
(
"\n\n"
);
# Print tabulated output to the main output file
my
(
$path_name
,
$path_class
) = (
"-"
,
"-"
);
$path_name
=
$path_hash
{
$path
}{name}
if
(
defined
$path_hash
{
$path
}{name} );
$path_class
=
$path_hash
{
$path
}{class}
if
(
defined
$path_hash
{
$path
}{class} );
$out_fh
->
(
"$path\t$path_name\t$path_class\t$mutSampleCnt\t$totalMutGenes\t"
,
"$pval\t"
,
$fdr_hash
{
$pval
},
"\n"
);
}
}
$out_detailed_fh
->
close
;
$out_fh
->
close
;
return
1;
}
# Reads files for each sample which are formatted as tab-separated lines each showing the number of # bases with sufficient coverage in a gene. sub read_CoverageFiles { my ( $covg_dir, $all_samples_ref, $gene_sample_cov_hash_ref ) = ( $_[0], $_[1], $_[2] );
# Read per-gene covered base counts for each sample
foreach
my
$sample
( @{
$all_samples_ref
} )
{
# If the file doesn't exist, quit with error. The Music::Bmr::CalcCovg step is incomplete
unless
( -s
"$covg_dir/$sample.covg"
)
{
STDERR
"Couldn't find $sample.covg in $covg_dir. (music bmr calc-covg possibly incomplete)\n"
;
exit
1;
}
my
$covgFh
= IO::File->new(
"$covg_dir/$sample.covg"
);
while
(
my
$line
=
$covgFh
->getline )
{
next
if
(
$line
=~ m/^
#/ );
my
(
$gene
,
undef
,
$covd_bases
) =
split
( /\t/,
$line
);
$gene_sample_cov_hash_ref
->{
$gene
}{
$sample
} =
$covd_bases
;
}
$covgFh
->
close
;
}
}
1;
1 POD Error
The following errors were encountered while parsing the POD:
- Around line 118:
=back doesn't take any parameters, but you said =back EOS ); }