- --pathway-file
-
- This is a tab-delimited file prepared from a pathway database (such as KEGG), with the columns: [path_id, path_name, class, gene_line, diseases, drugs, description] The latter three columns are optional (but are available on KEGG). The gene_line contains the "entrez_id:gene_name" of all genes involved in this pathway, each separated by a "|" symbol.
-
For example, a line in the pathway-file would look like:
hsa00061 Fatty acid biosynthesis Lipid Metabolism 31:ACACA|32:ACACB|27349:MCAT|2194:FASN|54995:OXSM|55301:OLAH
Ensure that the gene names and entrez IDs used match those used in the MAF file. Entrez IDs are not mandatory (use a 0 if Entrez ID unknown). But if a gene name in the MAF does not match any gene name in this file, the entrez IDs are used to find a match (unless it's a 0).
- --gene-covg-dir
- --bam-list
-
- Provide a file containing sample names and normal/tumor BAM locations for each. Use the tab- delimited format [sample_name normal_bam tumor_bam] per line. This tool only needs sample_name, so all other columns can be skipped. The sample_name must be the same as the tumor sample names used in the MAF file (16th column, with the header Tumor_Sample_Barcode).
- --bmr
- --genes-to-ignore
sub _doc_authors { return <<EOS Michael Wendl, Ph.D. EOS }
sub _doc_credits { return <<EOS This module uses reformatted copies of data from the Kyoto Encyclopedia of Genes and Genomes (KEGG) database:
* KEGG - http://www.genome.jp/kegg/
EOS
}
sub execute { my $self = shift; my $covg_dir = $self->gene_covg_dir; my $bam_list = $self->bam_list; my $pathway_file = $self->pathway_file; my $maf_file = $self->maf_file; my $output_file = $self->output_file; my $bgd_mut_rate = $self->bmr; my $genes_to_ignore = $self->genes_to_ignore; my $min_mut_genes_per_path = $self->min_mut_genes_per_path; my $skip_non_coding = $self->skip_non_coding; my $skip_silent = $self->skip_silent;
# Check on all the input data before starting work
print STDERR "MAF file not found or is empty: $maf_file\n" unless( -s $maf_file );
print STDERR "Directory with gene coverages not found: $covg_dir\n" unless( -e $covg_dir );
print STDERR "List of samples not found or is empty: $bam_list\n" unless( -s $bam_list );
print STDERR "Pathway info file not found or is empty: $pathway_file\n" unless( -s $pathway_file );
return undef unless( -s $maf_file && -e $covg_dir && -s $bam_list && -s $pathway_file );
# Build a hash to quickly lookup the genes whose mutations should be ignored
my %ignored_genes = ();
if( defined $genes_to_ignore )
{
%ignored_genes = map { $_ => 1 } split( /,/, $genes_to_ignore );
}
# PathScan uses a helluva lot of hashes - all your RAM are belong to it
my %sample_gene_hash; # sample => array of genes (based on maf)
my %gene_path_hash; # gene => array of pathways (based on path_file)
my %path_hash; # pathway => all the information about the pathways in the database
my %sample_path_hash; # sample => pathways (based on %sample_gene_hash and %gene_path_hash)
my %path_sample_hits_hash; # path => sample => hits,mutated_genes
my %gene_sample_cov_hash; # gene => sample => coverage
my @all_sample_names; # names of all the samples, no matter if it's mutated or not
my %id_gene_hash; # entrez id => gene (based on first two columns in MAF)
# Parse out the names of the samples which should match the names in the MAF file
my $sampleFh = IO::File->new( $bam_list ) or die "Couldn't open $bam_list. $!\n";
while( my $line = $sampleFh->getline )
{
next if ( $line =~ m/^#/ );
chomp( $line );
my ( $sample ) = split( /\t/, $line );
push( @all_sample_names, $sample );
}
$sampleFh->close;
# Read coverage data calculated by the Music::Bmr::CalcCovg
$covg_dir =~ s/(\/)+$//; # Remove trailing forward slashes if any
read_CoverageFiles( $covg_dir, \@all_sample_names, \%gene_sample_cov_hash );
#build gene => average_coverage hash for population test
my %gene_cov_hash;
foreach my $gene ( keys %gene_sample_cov_hash )
{
my $total_cov = 0;
my $sample_num = scalar( @all_sample_names );
$total_cov += $gene_sample_cov_hash{$gene}{$_} foreach( @all_sample_names );
$gene_cov_hash{$gene} = int( $total_cov / $sample_num );
}
#build %sample_gene_hash based on maf
my $maf_fh = IO::File->new( $maf_file );
while( my $line = $maf_fh->getline )
{
next if( $line =~ m/^(#|Hugo_Symbol)/ );
chomp( $line );
my @cols = split( /\t/, $line );
my ( $gene, $entrez_id, $mutation_class, $tumor_sample ) = ( $cols[0], $cols[1], $cols[8], $cols[15] );
# If the mutation classification is odd, quit with error
if( $mutation_class !~ m/^(Missense_Mutation|Nonsense_Mutation|Nonstop_Mutation|Splice_Site|Translation_Start_Site|Frame_Shift_Del|Frame_Shift_Ins|In_Frame_Del|Silent|In_Frame_Ins|Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region|De_novo_Start_InFrame|De_novo_Start_OutOfFrame)$/ )
{
print STDERR "Unrecognized Variant_Classification $mutation_class in MAF file.\n";
print STDERR "Please use TCGA MAF Specification v2.3.\n";
return undef;
}
# If user wants, skip Silent mutations, or those in Introns, RNA, UTRs, Flanks, IGRs, or the ubiquitous Targeted_Region
if(( $skip_non_coding && $mutation_class =~ m/^(Intron|RNA|3'Flank|3'UTR|5'Flank|5'UTR|IGR|Targeted_Region)$/ ) ||
( $skip_silent && $mutation_class =~ m/^Silent$/ ))
{
print STDERR "Skipping $mutation_class mutation in gene $gene.\n";
next;
}
# Check that the user followed instructions and named each sample correctly
unless( grep( /^$tumor_sample$/, @all_sample_names ))
{
print STDERR "Sample $tumor_sample in MAF file does not match any in $bam_list\n";
return undef;
}
next if( defined $ignored_genes{$gene} ); # Ignore variants in genes that user wants ignored
$id_gene_hash{$entrez_id} = $gene unless( $entrez_id eq '' or $entrez_id == 0 or $entrez_id !~ m/^\d+$/ );
push( @{$sample_gene_hash{$tumor_sample}}, $gene ) unless( grep /^$gene$/, @{$sample_gene_hash{$tumor_sample}} );
}
$maf_fh->close;
my $path_fh = IO::File->new( $pathway_file );
while( my $line = $path_fh->getline )
{
chomp( $line );
next if( $line =~ /^(#|ID)/ ); #Skip headers
my ( $path_id, $name, $class, $gene_line, $diseases, $drugs, $description ) = split( /\t/, $line );
my @genes = split( /\|/, $gene_line ); #Each gene is in the format "EntrezID:GeneSymbol"
$diseases =~ s/\|/, /g; #Change the separators to commas
$drugs =~ s/\|/, /g; #Change the separators to commas
$path_hash{$path_id}{name} = $name unless( $name eq '' );
$path_hash{$path_id}{class} = $class unless( $class eq '' );
$path_hash{$path_id}{diseases} = $diseases unless( $diseases eq '' );
$path_hash{$path_id}{drugs} = $drugs unless( $drugs eq '' );
$path_hash{$path_id}{description} = $description unless( $description eq '' );
@{$path_hash{$path_id}{gene}} = ();
foreach my $gene ( @genes )
{
my ( $entrez_id, $gene_symbol ) = split( /:/, $gene );
unless( $entrez_id eq '' or $entrez_id == 0 or $entrez_id !~ m/^\d+$/ )
{
# Use the gene name from the MAF file if the entrez ID matches
$gene_symbol = $id_gene_hash{$entrez_id} if( defined $id_gene_hash{$entrez_id} );
}
push( @{$gene_path_hash{$gene_symbol}}, $path_id ) unless( grep /^$path_id$/, @{$gene_path_hash{$gene_symbol}} );
unless( grep /^$gene_symbol$/, @{$path_hash{$path_id}{gene}} )
{
push( @{$path_hash{$path_id}{gene}}, $gene_symbol );
}
}
}
$path_fh->close;
#build a sample => pathway hash
foreach my $sample ( keys %sample_gene_hash )
{
foreach my $gene ( @{$sample_gene_hash{$sample}} )
{
if( defined $gene_path_hash{$gene} )
{
foreach my $pathway ( @{$gene_path_hash{$gene}} )
{
push( @{$sample_path_hash{$sample}}, $pathway ) unless( grep /^$pathway$/, @{$sample_path_hash{$sample}} );
}
}
}
}
#build path_sample_hits_hash, for population test
foreach my $sample ( keys %sample_path_hash )
{
foreach my $path ( @{$sample_path_hash{$sample}} )
{
my $hits = 0;
my @mutated_genes = (); #Mutated genes in this sample belonging to this pathway
my @mutated_genes_in_sample = @{$sample_gene_hash{$sample}};
foreach my $gene ( @{$path_hash{$path}{gene}} )
{
if( grep /^$gene$/, @mutated_genes_in_sample ) #if this gene is mutated in this sample (in maf)
{
$hits++;
push( @mutated_genes, $gene );
}
}
if( $hits > 0 )
{
$path_sample_hits_hash{$path}{$sample}{hits} = $hits;
$path_sample_hits_hash{$path}{$sample}{mutated_genes} = \@mutated_genes;
}
}
}
#Calculation of p value
my %data; #For printing
my @pvals;
foreach my $path ( sort keys %path_hash )
{
my @pathway_genes = @{$path_hash{$path}{gene}};
my @gene_sizes = ();
foreach my $gene ( @pathway_genes )
{
if( defined $gene_cov_hash{$gene} )
{
my $avg_cov = int( $gene_cov_hash{$gene} );
push( @gene_sizes, $avg_cov ) if( $avg_cov > 3 );
}
}
#If this pathway doesn't have any gene coverage, skip it
next unless( scalar( @gene_sizes ) > 0 );
my @num_hits_per_sample; #store hits info for each patient
my @mutated_samples = sort keys %{$path_sample_hits_hash{$path}};
foreach my $sample ( @all_sample_names )
{
my $hits = 0;
#if this sample has mutation
if( grep /^$sample$/, @mutated_samples )
{
$hits = $path_sample_hits_hash{$path}{$sample}{hits};
}
push( @num_hits_per_sample, $hits );
}
#If this pathway doesn't have any mutated genes in any samples, skip it
next unless( scalar( @num_hits_per_sample ) > 0 );
my $hits_ref = \@num_hits_per_sample;
########### MCW ADDED
# FIND MAX NUMBER OF HITS IN A SAMPLE
my $max_hits = 0;
foreach my $hits_in_sample ( @num_hits_per_sample )
{
$max_hits = $hits_in_sample if( $hits_in_sample > $max_hits );
}
########### MCW ADDED
my $pop_obj = Genome::Model::Tools::Music::PathScan::PopulationPathScan->new( \@gene_sizes );
if( scalar( @gene_sizes ) >= 3 )
{
########### MCW ADDED
if( $max_hits > 15 )
{
$pop_obj->assign( 5 );
}
else
{
$pop_obj->assign( 3 );
}
########### MCW ADDED
#$pop_obj->assign(3);
}
elsif( @gene_sizes == 2 )
{
$pop_obj->assign( 2 );
}
else
{
$pop_obj->assign( 1 );
}
$pop_obj->preprocess( $bgd_mut_rate, $hits_ref ); #mwendl's new fix
my $pval = $pop_obj->population_pval_approx($hits_ref);
$data{$pval}{$path}{samples} = \@mutated_samples;
$data{$pval}{$path}{hits} = $hits_ref;
push( @pvals, $pval ); # For calculation of FDR
}
# Calculate False Discovery Rates (Benjamini-Hochberg FDR) for the p-values
my $pval_cnt = scalar( @pvals );
my %fdr_hash;
for( my $i = 0; $i < $pval_cnt; $i++ )
{
my $fdr = $pvals[$i] * $pval_cnt / ( $pval_cnt - $i );
$fdr = 1 if $fdr > 1;
$fdr_hash{$pvals[$i]} = $fdr;
}
# Print two output files, one more detailed than the other
my $out_fh = IO::File->new( $output_file, ">" );
my $out_detailed_fh = IO::File->new( "$output_file\_detailed", ">" );
$out_fh->print( "Pathway\tName\tClass\tSamples_Affected\tTotal_Variations\tp-value\tFDR\n" );
foreach my $pval ( sort { $a <=> $b } keys %data )
{
foreach my $path ( sort keys %{$data{$pval}} )
{
# Skip this pathway if it has fewer affected genes than the user wants
my %mutated_gene_hash;
my @samples = @{$data{$pval}{$path}{samples}};
foreach my $sample ( @samples )
{
foreach my $gene ( @{$path_sample_hits_hash{$path}{$sample}{mutated_genes}} )
{
$mutated_gene_hash{$gene}++;
}
}
next unless ( scalar( keys %mutated_gene_hash ) >= $min_mut_genes_per_path );
# Print detailed output to a separate output file
$out_detailed_fh->print( "Pathway: $path\n" );
$out_detailed_fh->print( "Name: ", $path_hash{$path}{name}, "\n" ) if( defined $path_hash{$path}{name} );
$out_detailed_fh->print( "Class: ", $path_hash{$path}{class}, "\n" ) if( defined $path_hash{$path}{class} );
$out_detailed_fh->print( "Diseases: ", $path_hash{$path}{diseases}, "\n" ) if( defined $path_hash{$path}{diseases} );
$out_detailed_fh->print( "Drugs: ", $path_hash{$path}{drugs}, "\n" ) if( defined $path_hash{$path}{drugs} );
$out_detailed_fh->print( "P-value: $pval\n", "FDR: ", $fdr_hash{$pval}, "\n" );
$out_detailed_fh->print( "Description: ", $path_hash{$path}{description}, "\n" );
my @hits = @{$data{$pval}{$path}{hits}};
foreach my $sample ( @samples )
{
my @mutated_genes = @{$path_sample_hits_hash{$path}{$sample}{mutated_genes}};
$out_detailed_fh->print( "$sample:" );
$out_detailed_fh->print( join ",", @mutated_genes );
$out_detailed_fh->print( "\n" );
}
my ( $mutSampleCnt, $totalMutGenes ) = ( 0, 0 );
$out_detailed_fh->print( "Samples with mutations (#hits): " );
for( my $i = 0; $i < scalar( @all_sample_names ); ++$i )
{
if( $hits[$i] > 0 )
{
$out_detailed_fh->print( "$all_sample_names[$i]($hits[$i]) " );
$mutSampleCnt++;
$totalMutGenes += $hits[$i];
}
}
$out_detailed_fh->print( "\n\n" );
# Print tabulated output to the main output file
my ( $path_name, $path_class ) = ( "-", "-" );
$path_name = $path_hash{$path}{name} if( defined $path_hash{$path}{name} );
$path_class = $path_hash{$path}{class} if( defined $path_hash{$path}{class} );
$out_fh->print( "$path\t$path_name\t$path_class\t$mutSampleCnt\t$totalMutGenes\t",
"$pval\t", $fdr_hash{$pval}, "\n" );
}
}
$out_detailed_fh->close;
$out_fh->close;
return 1;
}
# Reads files for each sample which are formatted as tab-separated lines each showing the number of # bases with sufficient coverage in a gene. sub read_CoverageFiles { my ( $covg_dir, $all_samples_ref, $gene_sample_cov_hash_ref ) = ( $_[0], $_[1], $_[2] );
# Read per-gene covered base counts for each sample
foreach my $sample ( @{$all_samples_ref} )
{
# If the file doesn't exist, quit with error. The Music::Bmr::CalcCovg step is incomplete
unless( -s "$covg_dir/$sample.covg" )
{
print STDERR "Couldn't find $sample.covg in $covg_dir. (music bmr calc-covg possibly incomplete)\n";
exit 1;
}
my $covgFh = IO::File->new( "$covg_dir/$sample.covg" );
while( my $line = $covgFh->getline )
{
next if( $line =~ m/^#/ );
my ( $gene, undef, $covd_bases ) = split( /\t/, $line );
$gene_sample_cov_hash_ref->{$gene}{$sample} = $covd_bases;
}
$covgFh->close;
}
}
1;
1 POD Error
The following errors were encountered while parsing the POD:
- Around line 118:
=back doesn't take any parameters, but you said =back EOS ); }