#!/usr/local/bin/perl
our
$VERSION
=
'0.07'
;
my
$pos
= 0;
my
$ARGC
;
my
%chromosome_length
= (
X
=> 154913754,
Y
=> 57741652,
1
=> 247199719,
2
=> 242751149,
3
=> 199446827,
4
=> 191263063,
5
=> 180837866,
6
=> 170896993,
7
=> 158821424,
8
=> 146274826,
9
=> 140442298,
10
=> 135374737,
11
=> 134452384,
12
=> 132289534,
13
=> 114127980,
14
=> 106360585,
15
=> 100338915,
16
=> 88822254,
17
=> 78654742,
18
=> 76117153,
19
=> 63806651,
20
=> 62435965,
21
=> 46944323,
22
=> 49528953
);
my
$TP53_start
= 1000000*7.57;
my
$TP53_end
= 1000000*7.59;
my
$insertion_data_present
= 0;
my
$LOH_data_present
= 0;
our
$bin_size
;
our
$localization_window_size
;
our
$expected_mutation_density
;
our
$low_mutation_density_threshold
;
our
$collapse_regions
;
our
$outlier_deviation
;
our
$translocation_cut_off_count
;
our
$chromosome_localization_weight
;
our
$genome_localization_weight
;
our
$cnv_weight
;
our
$translocation_weight
;
our
$insertion_breakpoint_weight
;
our
$loh_weight
;
our
$tp53_mutated_weight
;
sub
run {
my
$argv_ref
=
shift
;
my
@argv
= @{
$argv_ref
};
my
$cnv_directory
;
my
$trans_directory
;
my
$insertion_directory
;
my
$loh_directory
;
my
$output_directory
;
my
$config_file_path
;
my
$tp53_mutated
= 0;
my
$tp53_mutation_found
= 0;
my
@cnv_files
;
my
@trans_files
;
my
@insertion_files
;
my
@loh_files
;
my
$chromosome_copy_number_count_hash_ref
;
my
$chromosome_cnv_breakpoints_hash_ref
;
my
$chromosome_translocation_count_hash_ref
;
my
$chromosome_insertion_count_hash_ref
;
my
$chromosome_loh_breakpoints_hash_ref
;
my
$genome_trans_breakpoints_hash_ref
;
my
$genome_trans_insertion_breakpoints_hash_ref
;
my
$genome_mutation_density_hash_ref
;
my
$genome_cnv_data_hash_ref
= initialize_genome_hash();
my
$genome_trans_data_hash_ref
= initialize_genome_hash();
my
$genome_insertion_data_hash_ref
= initialize_genome_hash();
my
$genome_cnv_data_windows_hash_ref
;
my
$genome_trans_data_windows_hash_ref
;
my
$genome_mutation_data_windows_hash_ref
;
my
$suspect_regions_array_ref
;
my
$likely_regions_array_ref
;
validate_input(\
@argv
, \
$cnv_directory
, \
$trans_directory
, \
$insertion_directory
, \
$loh_directory
, \
$tp53_mutated
, \
$output_directory
, \
$config_file_path
);
load_config_file(
$config_file_path
);
print
"CNV dir:\t$cnv_directory\n"
;
print
"Trans dir:\t$trans_directory\n"
;
if
(
defined
(
$insertion_directory
)){
print
"insertion dir:\t$insertion_directory\n"
;
}
if
(
defined
(
$loh_directory
)){
print
"LOH dir:\t$loh_directory\n"
;
}
print
"Output dir:\t$output_directory\n"
;
print
"Force TP53 Mutation:\t$tp53_mutated\n\n"
;
@cnv_files
=
glob
(
"$cnv_directory"
.
"*.spc"
);
@trans_files
=
glob
(
"$trans_directory"
.
"*.spt"
);
if
(
scalar
(
@cnv_files
)==0 ||
scalar
(
@trans_files
)==0){
die
"ERROR: no CNV or translocation input files found\n"
;
}
if
(
defined
(
$insertion_directory
)){
@insertion_files
=
glob
(
"$insertion_directory"
.
"*.vcf"
);
}
if
(
defined
(
$loh_directory
)){
@loh_files
=
glob
(
"$loh_directory"
.
"*.spl"
);
}
$
" = "
\n\t\t";
print
"CNV files:\t@cnv_files\n\n"
;
print
"Trans files:\t@trans_files\n\n"
;
$
" = "
\n\t\t";
if
(
scalar
(
@insertion_files
)==0){
print
"Indel files:\t-none\n\n"
;
}
else
{
print
"Indel files:\t@insertion_files\n\n"
;
}
$
" = "
\n\t";
if
(
scalar
(
@loh_files
)==0){
print
"LOH files:\t-none\n\n"
;
}
else
{
print
"LOH files:\t@loh_files\n\n"
;
}
$
" = "
";
mkdir
(
"$output_directory"
,0770)
unless
(-d
"$output_directory"
);
if
(!(-e
$output_directory
)){
die
"ERROR: could not create directory: $output_directory\n"
;
}
print
"\n--analyzing CNV data\n"
;
(
$genome_cnv_data_hash_ref
,
$chromosome_copy_number_count_hash_ref
,
$chromosome_cnv_breakpoints_hash_ref
) = analyze_cnv_data(
$output_directory
, \
@cnv_files
,
$bin_size
, \
$tp53_mutation_found
);
print
"---done analyzing CNV data\n\n"
;
print
"--analyzing translocation data\n"
;
(
$genome_trans_data_hash_ref
,
$chromosome_translocation_count_hash_ref
,
$genome_trans_breakpoints_hash_ref
) = analyze_trans_data(
$output_directory
, \
@trans_files
,
$bin_size
, \
$tp53_mutation_found
);
print
"---done analyzing translocation data\n\n"
;
if
(
defined
(
$insertion_directory
)){
print
"--analyzing insertion data\n"
;
(
$genome_insertion_data_hash_ref
,
$chromosome_insertion_count_hash_ref
,
$genome_trans_insertion_breakpoints_hash_ref
) = analyze_insertion_data(
$output_directory
, \
@insertion_files
,
$bin_size
,
$genome_trans_breakpoints_hash_ref
, \
$tp53_mutation_found
);
print
"---done analyzing insertion data\n\n"
;
}
%$genome_trans_breakpoints_hash_ref
= ();
undef
$genome_trans_breakpoints_hash_ref
;
if
(
defined
(
$loh_directory
)){
print
"--analyzing LOH data\n"
;
(
$chromosome_loh_breakpoints_hash_ref
) = analyze_loh_data(
$output_directory
, \
@loh_files
, \
$tp53_mutation_found
);
print
"---done analyzing LOH data\n\n"
;
my
%loh_hash
= %{
$chromosome_loh_breakpoints_hash_ref
};
for
my
$key1
(
keys
%loh_hash
){
my
@a
= @{
$loh_hash
{
$key1
}};
my
$size
=
@a
;
if
(
$size
% 2 != 0){
die
"ERROR: odd number of loh breakpoints recorded for chromosome $key1\n"
;
}
}
}
print
"--calculating chromosome mutation densities\n"
;
$genome_mutation_density_hash_ref
= calculate_genome_localization(
$output_directory
,
$chromosome_copy_number_count_hash_ref
,
$chromosome_translocation_count_hash_ref
);
print
"---done calculating chromosome mutation densities\n\n"
;
print
"--calculating chromosome region mutation densities\n"
;
(
$suspect_regions_array_ref
,
$likely_regions_array_ref
,
$genome_cnv_data_windows_hash_ref
,
$genome_trans_data_windows_hash_ref
,
$genome_mutation_data_windows_hash_ref
) = calculate_chromosome_localization(
$output_directory
,
$genome_cnv_data_hash_ref
,
$genome_trans_data_hash_ref
,
$bin_size
,
$localization_window_size
);
print
"---done calculating chromosome region mutation densities\n\n"
;
print
"--analyzing suspect regions\n"
;
analyze_suspect_regions(
$output_directory
,
$suspect_regions_array_ref
,
$genome_mutation_density_hash_ref
,
$genome_cnv_data_hash_ref
,
$genome_trans_data_hash_ref
,
$genome_trans_insertion_breakpoints_hash_ref
,
$bin_size
,
$localization_window_size
,
$tp53_mutated
,
$tp53_mutation_found
,
$chromosome_cnv_breakpoints_hash_ref
,
$chromosome_loh_breakpoints_hash_ref
);
print
"---done analyzing suspect regions\n\n"
;
print
"--analyzing likely regions\n"
;
analyze_likely_regions(
$output_directory
,
$likely_regions_array_ref
,
$genome_mutation_density_hash_ref
,
$genome_cnv_data_hash_ref
,
$genome_trans_data_hash_ref
,
$bin_size
,
$localization_window_size
);
print
"---done analyzing likely regions\n\n"
;
print
"--calculating copy number count\n"
;
check_copy_number_count(
$output_directory
,
$chromosome_copy_number_count_hash_ref
);
print
"---done calculating copy number count\n\n"
;
print
"--calculating switch count\n"
;
check_copy_number_switches(
$output_directory
,
$chromosome_copy_number_count_hash_ref
);
print
"---done calculating switch count\n\n"
;
print
"--calculating interchromosomal translocation rate\n"
;
calculate_interchromosomal_translocation_rate(
$output_directory
,
$chromosome_translocation_count_hash_ref
);
print
"---done calculating interchromosomal translocation rate\n"
;
}
sub
validate_input {
my
$argv_ref
=
shift
;
my
@argv
= @{
$argv_ref
};
my
$cnv_directory_ref
=
shift
;
my
$trans_directory_ref
=
shift
;
my
$insertion_directory_ref
=
shift
;
my
$loh_directory_ref
=
shift
;
my
$tp53_mutated_ref
=
shift
;
my
$output_directory_ref
=
shift
;
my
$config_file_path_ref
=
shift
;
$ARGC
=
@argv
;
switch(
$ARGC
){
case 0 { usage(0); }
case 1 {
if
(
$argv
[0] eq
"--help"
){
man_text();
}
else
{
usage(1);
}
}
else
{
if
(
$argv
[
$pos
] eq
"--cnv"
){
next_arg(2);
$$cnv_directory_ref
=
$argv
[
$pos
];
if
(!(
substr
(
$$cnv_directory_ref
,-1,1) eq
'/'
)){
$$cnv_directory_ref
=
$$cnv_directory_ref
.
'/'
;
}
next_arg(3);
}
else
{
usage(4);
}
if
(
$argv
[
$pos
] eq
"--trans"
){
next_arg(5);
$$trans_directory_ref
=
$argv
[
$pos
];
if
(!(
substr
(
$$trans_directory_ref
,-1,1) eq
'/'
)){
$$trans_directory_ref
=
$$trans_directory_ref
.
'/'
;
}
next_arg(6);
}
else
{
usage(7);
}
if
(
$argv
[
$pos
] eq
"--insrt"
){
next_arg(8);
$$insertion_directory_ref
=
$argv
[
$pos
];
if
(!(
substr
(
$$insertion_directory_ref
,-1,1) eq
'/'
)){
$$insertion_directory_ref
=
$$insertion_directory_ref
.
'/'
;
}
$insertion_data_present
= 1;
next_arg(9);
}
if
(
$argv
[
$pos
] eq
"--loh"
){
next_arg(10);
$$loh_directory_ref
=
$argv
[
$pos
];
if
(!(
substr
(
$$loh_directory_ref
,-1,1) eq
'/'
)){
$$loh_directory_ref
=
$$loh_directory_ref
.
'/'
;
}
$LOH_data_present
= 1;
next_arg(11);
}
if
(
$argv
[
$pos
] eq
"--tp53"
){
$$tp53_mutated_ref
= 1;
next_arg(12);
}
if
(
$argv
[
$pos
] eq
"--config"
){
next_arg(13);
$$config_file_path_ref
=
$argv
[
$pos
];
next_arg(14);
}
else
{
usage(15);
}
if
(
$argv
[
$pos
] eq
"--output"
){
next_arg(16);
$$output_directory_ref
=
$argv
[
$pos
];
if
(!(
substr
(
$$output_directory_ref
,-1,1) eq
'/'
)){
$$output_directory_ref
=
$$output_directory_ref
.
'/'
;
}
}
else
{
usage(17);
}
if
(
$pos
==
$ARGC
-1){
last
;
}
else
{
usage(18);
}
}
}
}
sub
analyze_cnv_data {
my
$output_directory
=
shift
;
my
$cnv_files_array_ref
=
shift
;
my
@cnv_files
=
@$cnv_files_array_ref
;
my
$bin_size
=
shift
;
my
$tp53_mutation_found_ref
=
shift
;
my
%genome_cnv_data
= ();
my
@file_data
;
my
$CURRENT_FILE
;
my
$TP53_FILE
;
my
$line
;
my
@line_data
;
my
%chromosome_copy_number_count
= ();
my
%chromsome_cnv_breakpoints
= (
X
=> [],
Y
=> [],
1
=> [],
2
=> [],
3
=> [],
4
=> [],
5
=> [],
6
=> [],
7
=> [],
8
=> [],
9
=> [],
10
=> [],
11
=> [],
12
=> [],
13
=> [],
14
=> [],
15
=> [],
16
=> [],
17
=> [],
18
=> [],
19
=> [],
20
=> [],
21
=> [],
22
=> []
);
foreach
my
$file
(
@cnv_files
){
open
(
$CURRENT_FILE
,
"<"
,
$file
) or
die
"ERROR: could not open file at path $file\n"
;
if
(
eof
(
$CURRENT_FILE
)){
close
(
$CURRENT_FILE
);
die
"ERROR: $file is empty\n"
;
}
$line
= <
$CURRENT_FILE
>;
chomp
(
$line
);
if
(!(
$line
=~ m/^
close
(
$CURRENT_FILE
);
die
"ERROR: header of cnv file $file is invalid\n"
;
}
while
( !(
eof
(
$CURRENT_FILE
)) ){
$line
= <
$CURRENT_FILE
>;
chomp
(
$line
);
if
(!(
$line
=~ m/^(
chr
)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){
die
"ERROR: invalid line found ($line) in file $file\n"
;
}
@line_data
= (
split
(/\t/,
$line
));
push
(
@file_data
,[
@line_data
]);
}
close
(
$CURRENT_FILE
);
}
@file_data
= check_for_overlaps(
"cnv"
, \
@file_data
);
mkdir
(
"$output_directory"
.
"TP53"
,0770)
unless
(-d
"$output_directory"
.
"TP53"
);
if
(!(-e
"$output_directory"
.
"TP53"
)){
die
"ERROR: could not create folder $output_directory"
.
"TP53\n"
;
}
open
(
$TP53_FILE
,
">"
,
"$output_directory"
.
"TP53/TP53.spc"
) or
die
"ERROR: could not create file: $output_directory"
.
"TP53/TP53.spc"
;
print
$TP53_FILE
"#chr\tstart\tend\tnumber\tquality"
;
for
(
my
$n
= 0;
$n
<
scalar
(
@file_data
);
$n
++){
my
$hash
= {};
if
(!(
$file_data
[
$n
][0] =~ m/^(
chr
)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
die
"WARNING: invalid chromosome field detected, line skipped: @{$file_data[$n]}\n"
;
}
my
$chr
= $2;
$chromosome_copy_number_count
{
$chr
}{
$file_data
[
$n
][3]}++;
push
(@{
$chromsome_cnv_breakpoints
{
$chr
}}, (
$file_data
[
$n
][1],
$file_data
[
$n
][2]));
my
$start_index
=
int
(
$file_data
[
$n
][1]/
$bin_size
);
my
$end_index
=
int
(
$file_data
[
$n
][2]/
$bin_size
);
my
$update_bin
=
sub
{
my
$source_chr
=
shift
;
my
$index
=
shift
;
my
$copy_num
=
shift
;
my
$genome_hash_ref
=
shift
;
my
%genome_hash
= %{
$genome_hash_ref
};
my
$hash
= ();
if
(!
defined
(@{
$genome_hash
{
$source_chr
}}[
$index
])){
$hash
->{
'BPcount'
} = 1;
$hash
->{
$copy_num
} = 0.5;
@{
$genome_hash
{
$source_chr
}}[
$index
] =
$hash
;
}
else
{
${@{
$genome_hash
{
$source_chr
}}[
$index
]}{
'BPcount'
}++;
${@{
$genome_hash
{
$source_chr
}}[
$index
]}{
$copy_num
}+=0.5;
}
};
if
(!
defined
(@{
$genome_cnv_data
{
$chr
}}[
$start_index
])){
$hash
->{
'BPcount'
} = 1;
$hash
->{
$file_data
[
$n
][3]} = 0.5;
@{
$genome_cnv_data
{
$chr
}}[
$start_index
] =
$hash
;
}
else
{
${@{
$genome_cnv_data
{
$chr
}}[
$start_index
]}{
'BPcount'
}++;
${@{
$genome_cnv_data
{
$chr
}}[
$start_index
]}{
$file_data
[
$n
][3]}+=0.5;
}
$hash
= {};
if
(!
defined
(@{
$genome_cnv_data
{
$chr
}}[
$end_index
])){
$hash
->{
'BPcount'
} = 1;
$hash
->{
$file_data
[
$n
][3]} = 0.5;
@{
$genome_cnv_data
{
$chr
}}[
$end_index
] =
$hash
;
}
else
{
${@{
$genome_cnv_data
{
$chr
}}[
$end_index
]}{
'BPcount'
}++;
${@{
$genome_cnv_data
{
$chr
}}[
$end_index
]}{
$file_data
[
$n
][3]}+=0.5;
}
if
(
(
$chr
ne
'X'
&&
$chr
ne
'Y'
) &&
(
$chr
==17 )
){
if
(
(
$file_data
[
$n
][3] != 2 ) &&
( (
$file_data
[
$n
][1] >=
$TP53_start
&&
$file_data
[
$n
][1] <=
$TP53_end
) || (
$file_data
[
$n
][2] >=
$TP53_start
&&
$file_data
[
$n
][2] <=
$TP53_end
) )
){
$$tp53_mutation_found_ref
= 1;
print
$TP53_FILE
"\n"
;
for
(
my
$i
= 0;
$i
<
scalar
(@{
$file_data
[
$n
]});
$i
++){
print
$TP53_FILE
"$file_data[$n][$i]"
;
if
(
$i
!=
scalar
(@{
$file_data
[
$n
]})-1){
print
$TP53_FILE
"\t"
;
}
}
}
}
}
close
(
$TP53_FILE
);
return
(\
%genome_cnv_data
, \
%chromosome_copy_number_count
, \
%chromsome_cnv_breakpoints
);
}
sub
check_for_overlaps {
my
$type
=
shift
;
my
$file_data_ref
=
shift
;
my
@file_data
=
@$file_data_ref
;
my
$start_overlap
= 0;
my
$end_overlap
= 0;
for
(
my
$n
= 0;
$n
<
scalar
(
@file_data
);
$n
++){
for
(
my
$k
=
$n
+1;
$k
<
scalar
(
@file_data
);
$k
++){
if
(
$file_data
[
$n
][0] eq
$file_data
[
$k
][0]) {
if
(
$file_data
[
$n
][2]>=
$file_data
[
$k
][1] &&
$file_data
[
$n
][2]<=
$file_data
[
$k
][2]){
$end_overlap
= 1;
}
if
(
$file_data
[
$n
][1]>=
$file_data
[
$k
][1] &&
$file_data
[
$n
][1]<=
$file_data
[
$k
][2]){
$start_overlap
= 1;
}
if
(
$start_overlap
==1 ||
$end_overlap
==1) {
if
(
$type
eq
"trans"
){
die
"ERROR: found overlapping translocation source regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"
;
}
if
(
$type
eq
"loh"
){
die
"ERROR: found overlapping LOH regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"
;
}
if
(
(
$type
eq
"cnv"
) &&
(
$file_data
[
$n
][3] !=
$file_data
[
$k
][3] )
){
die
"ERROR: found overlapping regions with different copy number values:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"
;
}
elsif
(
$collapse_regions
==0) {
die
"ERROR: found overlapping copy number regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"
;
}
elsif
(
$collapse_regions
==1) {
if
(
$start_overlap
==1 &&
$end_overlap
==1){
$file_data
[
$n
][1] =
$file_data
[
$k
][1];
$file_data
[
$n
][2] =
$file_data
[
$k
][2];
}
elsif
(
$start_overlap
==1){
$file_data
[
$n
][1] =
$file_data
[
$k
][1];
}
elsif
(
$end_overlap
==1){
$file_data
[
$n
][2] =
$file_data
[
$k
][2];
}
if
(
$start_overlap
==1 ||
$end_overlap
==1){
@file_data
= (
@file_data
[0..(
$k
-1),(
$k
+1)..(
scalar
(
@file_data
)-1)]);
$start_overlap
= 0;
$end_overlap
= 0;
$k
=
$n
+1;
redo
;
}
}
}
elsif
(
$file_data
[
$n
][1]<=
$file_data
[
$k
][1] &&
$file_data
[
$n
][2]>=
$file_data
[
$k
][2]){
if
(
$type
eq
"trans"
){
die
"ERROR: found overlapping translocation source regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"
;
}
if
(
$type
eq
"loh"
){
die
"ERROR: found overlapping LOH regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"
;
}
if
(
(
$type
eq
"cnv"
) &&
(
$file_data
[
$n
][3] !=
$file_data
[
$k
][3] )
){
die
"ERROR: found overlapping regions with different copy number values:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"
;
}
elsif
(
$collapse_regions
==0) {
die
"ERROR: found overlapping copy number regions:\n\t@{$file_data[$n]}\n\t@{$file_data[$k]}\n"
;
}
elsif
(
$collapse_regions
==1) {
@file_data
= (
@file_data
[0..(
$k
-1),(
$k
+1)..(
scalar
(
@file_data
)-1)]);
}
}
}
}
}
return
(
@file_data
);
}
sub
analyze_trans_data {
my
$output_directory
=
shift
;
my
$trans_files_array_ref
=
shift
;
my
@trans_files
=
@$trans_files_array_ref
;
my
$bin_size
=
shift
;
my
$tp53_mutation_found_ref
=
shift
;
my
%genome_trans_data
= ();
my
@file_data
;
my
$CURRENT_FILE
;
my
$TP53_FILE
;
my
$line
;
my
@line_data
;
my
$chr1
;
my
$chr2
;
my
%chromosome_trans_count
= ();
my
%genome_trans_breakpoints
= (
X
=> [],
Y
=> [],
1
=> [],
2
=> [],
3
=> [],
4
=> [],
5
=> [],
6
=> [],
7
=> [],
8
=> [],
9
=> [],
10
=> [],
11
=> [],
12
=> [],
13
=> [],
14
=> [],
15
=> [],
16
=> [],
17
=> [],
18
=> [],
19
=> [],
20
=> [],
21
=> [],
22
=> []
);
foreach
my
$file
(
@trans_files
){
open
(
$CURRENT_FILE
,
"<"
,
$file
) or
die
"ERROR: could not open file at path $file\n"
;
if
(
eof
(
$CURRENT_FILE
)){
close
(
$CURRENT_FILE
);
die
"ERROR: $file is empty\n"
;
}
$line
= <
$CURRENT_FILE
>;
chomp
(
$line
);
if
(!(
$line
=~ m/^
close
(
$CURRENT_FILE
);
die
"ERROR: header of translocation file $file is invalid\n"
;
}
while
( !(
eof
(
$CURRENT_FILE
)) ){
$line
= <
$CURRENT_FILE
>;
chomp
(
$line
);
if
(!(
$line
=~ m/^(
chr
)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t(
chr
)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){
next
;
}
@line_data
= (
split
(/\t/,
$line
));
if
(
(
$line_data
[1] >=
$line_data
[2] ) ||
(
$line_data
[4] >=
$line_data
[5] )
){
}
push
(
@file_data
,[
@line_data
]);
}
close
(
$CURRENT_FILE
);
}
mkdir
(
"$output_directory"
.
"TP53"
,0770)
unless
(-d
"$output_directory"
.
"TP53"
);
if
(!(-e
"$output_directory"
.
"TP53"
)){
die
"ERROR: could not create folder $output_directory"
.
"TP53\n"
;
}
open
(
$TP53_FILE
,
">"
,
"$output_directory"
.
"TP53/TP53.spt"
) or
die
"ERROR: could not create file: $output_directory"
.
"TP53/TP53.spt"
;
print
$TP53_FILE
"#chr1\tstart\tend\tchr2\tstart\tend\tquality"
;
for
(
my
$n
= 0;
$n
<
scalar
(
@file_data
);
$n
++){
my
$hash
= {};
if
(!(
$file_data
[
$n
][0] =~ m/^(
chr
)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
die
"ERROR: invalid chromosome field detected in translocation file\n"
;
}
my
$chr1
= $2;
if
(!(
$file_data
[
$n
][3] =~ m/^(
chr
)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
die
"ERROR: invalid chromosome field detected in translocation file\n"
;
}
my
$chr2
= $2;
my
$start_index1
=
int
(
$file_data
[
$n
][1]/
$bin_size
);
my
$end_index1
=
int
(
$file_data
[
$n
][2]/
$bin_size
);
my
$start_index2
=
int
(
$file_data
[
$n
][4]/
$bin_size
);
my
$end_index2
=
int
(
$file_data
[
$n
][5]/
$bin_size
);
my
$update_bin
=
sub
{
my
$source_chr
=
shift
;
my
$dest_chr
=
shift
;
my
$index
=
shift
;
my
$data
=
shift
;
my
$type
=
shift
;
my
$genome_hash_ref
=
shift
;
my
%genome_hash
= %{
$genome_hash_ref
};
my
$hash
= ();
if
(!
defined
(@{
$genome_hash
{
$source_chr
}}[
$index
])){
$hash
->{
'BPcount'
} = 1;
push
(@{
$hash
->{
$type
}{
$dest_chr
}},
$data
);
@{
$genome_hash
{
$source_chr
}}[
$index
] =
$hash
;
}
else
{
${@{
$genome_hash
{
$source_chr
}}[
$index
]}{
'BPcount'
}++;
push
(@{${@{
$genome_hash
{
$source_chr
}}[
$index
]}{
$type
}{
$dest_chr
}},
$data
);
}
};
if
(!
defined
(@{
$genome_trans_data
{
$chr1
}}[
$start_index1
])){
$hash
->{
'BPcount'
} = 1;
push
(@{
$hash
->{
'out'
}{
$chr2
}},
$file_data
[
$n
][4]);
@{
$genome_trans_data
{
$chr1
}}[
$start_index1
] =
$hash
;
}
else
{
${@{
$genome_trans_data
{
$chr1
}}[
$start_index1
]}{
'BPcount'
}++;
push
(@{${@{
$genome_trans_data
{
$chr1
}}[
$start_index1
]}{
'out'
}{
$chr2
}},
$file_data
[
$n
][4]);
}
$hash
= {};
if
(!
defined
(@{
$genome_trans_data
{
$chr1
}}[
$end_index1
])){
$hash
->{
'BPcount'
} = 1;
push
(@{
$hash
->{
'out'
}{
$chr2
}},
$file_data
[
$n
][5]);
@{
$genome_trans_data
{
$chr1
}}[
$end_index1
] =
$hash
;
}
else
{
${@{
$genome_trans_data
{
$chr1
}}[
$end_index1
]}{
'BPcount'
}++;
push
(@{${@{
$genome_trans_data
{
$chr1
}}[
$end_index1
]}{
'out'
}{
$chr2
}},
$file_data
[
$n
][5]);
}
$hash
= {};
if
(!
defined
(@{
$genome_trans_data
{
$chr2
}}[
$start_index2
])){
$hash
->{
'BPcount'
} = 1;
push
(@{
$hash
->{
'in'
}{
$chr1
}},
$file_data
[
$n
][1]);
@{
$genome_trans_data
{
$chr2
}}[
$start_index2
] =
$hash
;
}
else
{
${@{
$genome_trans_data
{
$chr2
}}[
$start_index2
]}{
'BPcount'
}++;
push
(@{${@{
$genome_trans_data
{
$chr2
}}[
$start_index2
]}{
'in'
}{
$chr1
}},
$file_data
[
$n
][1]);
}
$hash
= {};
if
(!
defined
(@{
$genome_trans_data
{
$chr2
}}[
$end_index2
])){
$hash
->{
'BPcount'
} = 1;
push
(@{
$hash
->{
'in'
}{
$chr1
}},
$file_data
[
$n
][2]);
@{
$genome_trans_data
{
$chr2
}}[
$end_index2
] =
$hash
;
}
else
{
${@{
$genome_trans_data
{
$chr2
}}[
$end_index2
]}{
'BPcount'
}++;
push
(@{${@{
$genome_trans_data
{
$chr2
}}[
$end_index2
]}{
'in'
}{
$chr1
}},
$file_data
[
$n
][2]);
}
$chromosome_trans_count
{
$chr1
}{
$chr2
}++;
if
(
$chr1
ne
$chr2
){
$chromosome_trans_count
{
$chr2
}{
$chr1
}++;
}
push
(@{
$genome_trans_breakpoints
{
$chr1
}},
$file_data
[
$n
][1]);
push
(@{
$genome_trans_breakpoints
{
$chr1
}},
$file_data
[
$n
][2]);
push
(@{
$genome_trans_breakpoints
{
$chr2
}},
$file_data
[
$n
][4]);
push
(@{
$genome_trans_breakpoints
{
$chr2
}},
$file_data
[
$n
][5]);
if
(
(
$chr1
ne
'X'
&&
$chr1
ne
'Y'
) &&
(
$chr1
==17 )
){
if
(
( (
$file_data
[
$n
][1] >=
$TP53_start
&&
$file_data
[
$n
][1] <=
$TP53_end
) || (
$file_data
[
$n
][2] >=
$TP53_start
&&
$file_data
[
$n
][2] <=
$TP53_end
) )
){
$$tp53_mutation_found_ref
= 1;
print
$TP53_FILE
"\n"
;
for
(
my
$i
= 0;
$i
<
scalar
(@{
$file_data
[
$n
]});
$i
++){
print
$TP53_FILE
"$file_data[$n][$i]"
;
if
(
$i
!=
scalar
(@{
$file_data
[
$n
]})-1){
print
$TP53_FILE
"\t"
;
}
}
}
}
if
(
(
$chr2
ne
'X'
&&
$chr2
ne
'Y'
) &&
(
$chr2
==17 )
){
if
(
( (
$file_data
[
$n
][4] >=
$TP53_start
&&
$file_data
[
$n
][4] <=
$TP53_end
) || (
$file_data
[
$n
][5] >=
$TP53_start
&&
$file_data
[
$n
][5] <=
$TP53_end
) )
){
$$tp53_mutation_found_ref
= 1;
print
$TP53_FILE
"\n"
;
for
(
my
$i
= 0;
$i
<
scalar
(@{
$file_data
[
$n
]});
$i
++){
print
$TP53_FILE
"$file_data[$n][$i]"
;
if
(
$i
!=
scalar
(@{
$file_data
[
$n
]})-1){
print
$TP53_FILE
"\t"
;
}
}
}
}
}
close
(
$TP53_FILE
);
return
(\
%genome_trans_data
, \
%chromosome_trans_count
, \
%genome_trans_breakpoints
);
}
sub
analyze_insertion_data {
my
$output_directory
=
shift
;
my
$insertion_files_array_ref
=
shift
;
my
@insertion_files
=
@$insertion_files_array_ref
;
my
$bin_size
=
shift
;
my
$genome_trans_breakpoints_hash_ref
=
shift
;
my
%genome_trans_breakpoints
=
%$genome_trans_breakpoints_hash_ref
;
my
$tp53_mutation_found_ref
=
shift
;
my
%genome_insertion_data
= ();
my
$CURRENT_FILE
;
my
$TP53_FILE
;
my
$line
;
my
@line_data
;
my
$chr
;
my
$file_name
;
my
$path
;
my
$suffix
;
my
$rm_insertion_file_result
;
my
$insertion_found
= 0;
my
%chromosome_insertion_count
= ();
my
%genome_trans_insertion_breakpoints
= (
X
=> [],
Y
=> [],
1
=> [],
2
=> [],
3
=> [],
4
=> [],
5
=> [],
6
=> [],
7
=> [],
8
=> [],
9
=> [],
10
=> [],
11
=> [],
12
=> [],
13
=> [],
14
=> [],
15
=> [],
16
=> [],
17
=> [],
18
=> [],
19
=> [],
20
=> [],
21
=> [],
22
=> []
);
mkdir
(
"$output_directory"
.
"TP53"
,0770)
unless
(-d
"$output_directory"
.
"TP53"
);
if
(!(-e
"$output_directory"
.
"TP53"
)){
die
"ERROR: could not create folder $output_directory"
.
"TP53\n"
;
}
foreach
my
$file
(
@insertion_files
){
(
$file_name
,
$path
,
$suffix
) = File::Basename::fileparse(
$file
,
"\.[^.]*"
);
open
(
$CURRENT_FILE
,
"<"
,
$file
) or
die
"ERROR: could not open file at path $file\n"
;
if
(
eof
(
$CURRENT_FILE
)){
die
"ERROR: $file is empty\n"
;
}
open
(
$TP53_FILE
,
">"
,
"$output_directory"
.
"TP53/$file_name"
.
"$suffix"
) or
die
"ERROR: could not create file: $output_directory"
.
"TP53/$file_name"
.
"$suffix"
;
$line
= <
$CURRENT_FILE
>;
chomp
(
$line
);
while
(
$line
=~ m/^
print
$TP53_FILE
"$line\n"
;
$line
= <
$CURRENT_FILE
>;
chomp
(
$line
);
}
while
(1){
@line_data
= (
split
(/\t/,
$line
));
if
(
( !(
$line_data
[1] =~ m/^[0-9]+$/) ) ||
( !(
$line_data
[0] =~ m/^(
chr
)?(1[0-9]|2[0-2]|X|Y|x|y|[1-9])/) ) ||
(
length
(
$line_data
[4]) <=
length
(
$line_data
[3]) )
){
warn
"ERROR: invalid chromosome or non-insertion VCF data line found and skipped:\t$line\n"
;
$line
= <
$CURRENT_FILE
>;
unless
(
$line
){
last
;}
chomp
(
$line
);
next
;
}
$chr
= $2;
if
(
$chr
eq
'x'
){
$chr
=
'X'
;
}
if
(
$chr
eq
'y'
){
$chr
=
'Y'
;
}
$chromosome_insertion_count
{
$chr
}++;
if
(!
defined
(@{
$genome_insertion_data
{
$chr
}}[
int
(
$line_data
[1]/
$bin_size
)])){
${
$genome_insertion_data
{
$chr
}}[
int
(
$line_data
[1]/
$bin_size
)] = 1;
}
else
{
${
$genome_insertion_data
{
$chr
}}[
int
(
$line_data
[1]/
$bin_size
)]++;
}
foreach
my
$bp
(@{
$genome_trans_breakpoints
{
$chr
}}){
if
(
$line_data
[1] <
$bp
+10 &&
$line_data
[1] >
$bp
-10){
push
(@{
$genome_trans_insertion_breakpoints
{
$chr
}},
$bp
);
}
}
if
(
(
$chr
ne
'X'
&&
$chr
ne
'Y'
) &&
(
$chr
==17 )
){
if
(
$line_data
[1] >=
$TP53_start
&&
$line_data
[1] <=
$TP53_end
){
$$tp53_mutation_found_ref
= 1;
$insertion_found
= 1;
print
$TP53_FILE
"$line\n"
;
}
}
$line
= <
$CURRENT_FILE
>;
unless
(
$line
){
last
;}
chomp
(
$line
);
}
close
(
$CURRENT_FILE
);
close
(
$TP53_FILE
);
if
(
$insertion_found
!=1){
my
$dir
=
"$output_directory"
.
"TP53/$file_name"
.
"$suffix"
;
$rm_insertion_file_result
= `rm
$dir
`;
}
$insertion_found
= 0;
}
return
(\
%genome_insertion_data
, \
%chromosome_insertion_count
, \
%genome_trans_insertion_breakpoints
);
}
sub
analyze_loh_data {
my
$output_directory
=
shift
;
my
$loh_files_array_ref
=
shift
;
my
@loh_files
=
@$loh_files_array_ref
;
my
$tp53_mutation_found_ref
=
shift
;
my
@file_data
;
my
$CURRENT_FILE
;
my
$TP53_FILE
;
my
$line
;
my
@line_data
;
my
%chromsome_loh_breakpoints
= (
X
=> [],
Y
=> [],
1
=> [],
2
=> [],
3
=> [],
4
=> [],
5
=> [],
6
=> [],
7
=> [],
8
=> [],
9
=> [],
10
=> [],
11
=> [],
12
=> [],
13
=> [],
14
=> [],
15
=> [],
16
=> [],
17
=> [],
18
=> [],
19
=> [],
20
=> [],
21
=> [],
22
=> []
);
foreach
my
$file
(
@loh_files
){
open
(
$CURRENT_FILE
,
"<"
,
$file
) or
die
"ERROR: could not open file at path $file\n"
;
if
(
eof
(
$CURRENT_FILE
)){
close
(
$CURRENT_FILE
);
die
"ERROR: $file is empty\n"
;
}
$line
= <
$CURRENT_FILE
>;
chomp
(
$line
);
if
(!(
$line
=~ m/^
close
(
$CURRENT_FILE
);
die
"ERROR: header of loh file $file is invalid\n"
;
}
while
( !(
eof
(
$CURRENT_FILE
)) ){
$line
= <
$CURRENT_FILE
>;
chomp
(
$line
);
if
(!(
$line
=~ m/^(
chr
)?(1[0-9]|2[0-2]|X|Y|[1-9])\t[0-9]+\t[0-9]+\t([0-9]+|\.)$/)){
die
"ERROR: invalid line found ($line) in file $file\n"
;
}
@line_data
= (
split
(/\t/,
$line
));
push
(
@file_data
,[
@line_data
]);
}
close
(
$CURRENT_FILE
);
}
@file_data
= check_for_overlaps(
"loh"
, \
@file_data
);
mkdir
(
"$output_directory"
.
"TP53"
,0770)
unless
(-d
"$output_directory"
.
"TP53"
);
if
(!(-e
"$output_directory"
.
"TP53"
)){
die
"ERROR: could not create folder $output_directory"
.
"TP53\n"
;
}
open
(
$TP53_FILE
,
">"
,
"$output_directory"
.
"TP53/TP53.spl"
) or
die
"ERROR: could not create file: $output_directory"
.
"TP53/TP53.spl"
;
print
$TP53_FILE
"#chr\tstart\tend\tquality"
;
for
(
my
$n
= 0;
$n
<
scalar
(
@file_data
);
$n
++){
if
(!(
$file_data
[
$n
][0] =~ m/^(
chr
)?(1[0-9]|2[0-2]|X|Y|[1-9])$/)){
die
"ERROR: invalid chromosome field detected\n"
;
}
my
$chr
= $2;
push
(@{
$chromsome_loh_breakpoints
{
$chr
}}, (
$file_data
[
$n
][1],
$file_data
[
$n
][2]));
if
(
(
$chr
ne
'X'
&&
$chr
ne
'Y'
) &&
(
$chr
==17 )
){
if
(
( (
$file_data
[
$n
][1] >=
$TP53_start
&&
$file_data
[
$n
][1] <=
$TP53_end
) || (
$file_data
[
$n
][2] >=
$TP53_start
&&
$file_data
[
$n
][2] <=
$TP53_end
) )
){
$$tp53_mutation_found_ref
= 1;
print
$TP53_FILE
"\n"
;
for
(
my
$i
= 0;
$i
<
scalar
(@{
$file_data
[
$n
]});
$i
++){
print
$TP53_FILE
"$file_data[$n][$i]"
;
if
(
$i
!=
scalar
(@{
$file_data
[
$n
]})-1){
print
$TP53_FILE
"\t"
;
}
}
}
}
}
close
(
$TP53_FILE
);
return
(\
%chromsome_loh_breakpoints
);
}
sub
calculate_genome_localization {
my
$output_directory
=
shift
;
my
$chromosome_copy_number_count_hash_ref
=
shift
;
my
$chromosome_translocation_count_hash_ref
=
shift
;
my
%chromosome_mutation_count
;
my
$density
;
my
$OUTPUT_FILE
;
for
(
my
$i
=1;
$i
<23;
$i
++){
$chromosome_mutation_count
{
$i
} = 0;
}
$chromosome_mutation_count
{
'X'
} = 0;
$chromosome_mutation_count
{
'Y'
} = 0;
for
my
$cnv_key1
(
keys
%$chromosome_copy_number_count_hash_ref
){
for
my
$cnv_key2
(
keys
%{
$chromosome_copy_number_count_hash_ref
->{
$cnv_key1
}}){
$chromosome_mutation_count
{
$cnv_key1
} +=
$chromosome_copy_number_count_hash_ref
->{
$cnv_key1
}->{
$cnv_key2
};
}
}
for
my
$trans_key1
(
keys
%$chromosome_translocation_count_hash_ref
){
for
my
$trans_key2
(
keys
%{
$chromosome_translocation_count_hash_ref
->{
$trans_key1
}}){
$chromosome_mutation_count
{
$trans_key1
} +=
$chromosome_translocation_count_hash_ref
->{
$trans_key1
}->{
$trans_key2
};
}
}
open
(
$OUTPUT_FILE
,
">"
,
"$output_directory/genome_localization.log"
) or
die
"ERROR: could not create file $output_directory/genome_localization.log\n"
;
print
$OUTPUT_FILE
"#chr\tcount\tdensity\n"
;
for
my
$chr
(
sort
keys
%chromosome_mutation_count
){
$density
=
$chromosome_mutation_count
{
$chr
}/
$chromosome_length
{
$chr
};
print
$OUTPUT_FILE
"$chr"
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
$chromosome_mutation_count
{
$chr
};
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
"$density"
;
print
$OUTPUT_FILE
"\n"
;
$chromosome_mutation_count
{
$chr
} =
$density
;
}
close
(
$OUTPUT_FILE
);
return
(\
%chromosome_mutation_count
);
}
sub
calculate_chromosome_localization {
my
$output_directory
=
shift
;
my
$genome_cnv_data_hash_ref
=
shift
;
my
$genome_trans_data_hash_ref
=
shift
;
my
$bin_size
=
shift
;
my
$window_size
=
shift
;
my
@suspect_regions
;
my
@likely_regions
;
my
$in_suspect_region
= 0;
my
$in_likely_region
= 0;
my
$suspect_chr
= -1;
my
$suspect_start
= -1;
my
$suspect_end
= -1;
my
$likely_chr
= -1;
my
$likely_start
= -1;
my
$likely_end
= -1;
my
%genome_cnv_data_windows
= (
X
=> [],
Y
=> [],
1
=> [],
2
=> [],
3
=> [],
4
=> [],
5
=> [],
6
=> [],
7
=> [],
8
=> [],
9
=> [],
10
=> [],
11
=> [],
12
=> [],
13
=> [],
14
=> [],
15
=> [],
16
=> [],
17
=> [],
18
=> [],
19
=> [],
20
=> [],
21
=> [],
22
=> []
);
my
%genome_trans_data_windows
= (
X
=> [],
Y
=> [],
1
=> [],
2
=> [],
3
=> [],
4
=> [],
5
=> [],
6
=> [],
7
=> [],
8
=> [],
9
=> [],
10
=> [],
11
=> [],
12
=> [],
13
=> [],
14
=> [],
15
=> [],
16
=> [],
17
=> [],
18
=> [],
19
=> [],
20
=> [],
21
=> [],
22
=> []
);
my
%genome_mutation_data_windows
= (
X
=> [],
Y
=> [],
1
=> [],
2
=> [],
3
=> [],
4
=> [],
5
=> [],
6
=> [],
7
=> [],
8
=> [],
9
=> [],
10
=> [],
11
=> [],
12
=> [],
13
=> [],
14
=> [],
15
=> [],
16
=> [],
17
=> [],
18
=> [],
19
=> [],
20
=> [],
21
=> [],
22
=> []
);
my
$current_chr
;
my
@current_chr_data
= ();
my
$genome_mean_mutation_density
= 0;
my
$total_genome_windows
= 0;
my
$genome_mutation_density_standard_deviation
= 0;
my
$OUTPUT_FILE
;
$output_directory
=
$output_directory
.
"mutation_clustering"
;
mkdir
(
"$output_directory"
,0770)
unless
(-d
"$output_directory"
);
if
(!(-e
"$output_directory"
)){
die
"ERROR: could not create folder $output_directory\n"
;
}
mkdir
(
"$output_directory/cnv"
,0770)
unless
(-d
"$output_directory/cnv"
);
if
(!(-e
"$output_directory"
)){
die
"ERROR: could not create folder $output_directory/cnv\n"
;
}
mkdir
(
"$output_directory/translocations"
,0770)
unless
(-d
"$output_directory/translocations"
);
if
(!(-e
"$output_directory"
)){
die
"ERROR: could not create folder $output_directory/translocations\n"
;
}
mkdir
(
"$output_directory/all_types"
,0770)
unless
(-d
"$output_directory/all_types"
);
if
(!(-e
"$output_directory"
)){
die
"ERROR: could not create folder $output_directory/all_types\n"
;
}
for
my
$cnv_key
(
keys
%$genome_cnv_data_hash_ref
){
@current_chr_data
= @{
$genome_cnv_data_hash_ref
->{
$cnv_key
}};
if
(
scalar
(
@current_chr_data
) > 0){
open
(
$OUTPUT_FILE
,
">"
,
"$output_directory/cnv/chr$cnv_key"
.
"_cnv_localization.log"
) or
die
"ERROR: could not create file $output_directory/cnv/chr$cnv_key"
.
"_cnv_localization.log"
;
print
$OUTPUT_FILE
"#chr\tstart\tend\tdensity"
;
@{
$genome_cnv_data_windows
{
$cnv_key
}}[0] = 0;
for
(
my
$chr_pos
= 0;
$chr_pos
<
$window_size
;
$chr_pos
++){
my
%region_hash
;
if
(!
defined
(
$current_chr_data
[
$chr_pos
])){
next
;
}
%region_hash
= %{
$current_chr_data
[
$chr_pos
]};
@{
$genome_cnv_data_windows
{
$cnv_key
}}[0] +=
$region_hash
{
'BPcount'
};
}
print
$OUTPUT_FILE
"\n"
;
print
$OUTPUT_FILE
"$cnv_key"
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
"0"
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
(
$window_size
)
*$bin_size
;
print
$OUTPUT_FILE
"\t"
;
if
(!
defined
(@{
$genome_cnv_data_windows
{
$cnv_key
}}[0])){
print
$OUTPUT_FILE
"0"
;
}
else
{
print
$OUTPUT_FILE
(POSIX::ceil((@{
$genome_cnv_data_windows
{
$cnv_key
}}[0])/2))/(
$window_size
*$bin_size
);
@{
$genome_mutation_data_windows
{
$cnv_key
}}[0] += POSIX::ceil(@{
$genome_cnv_data_windows
{
$cnv_key
}}[0]/2);
}
for
(
my
$chr_pos
= 1;
$chr_pos
<
scalar
(
@current_chr_data
);
$chr_pos
++){
if
( ((
$chr_pos
+(
$window_size
-1))
*$bin_size
) >
$chromosome_length
{
$cnv_key
} ){
last
;
}
@{
$genome_cnv_data_windows
{
$cnv_key
}}[
$chr_pos
] = 0;
my
%past_region_hash
;
my
%next_region_hash
;
my
$prev_value
= 0;
my
$next_value
= 0;
if
(
defined
(
$current_chr_data
[
$chr_pos
-1])){
%past_region_hash
= %{
$current_chr_data
[
$chr_pos
-1]};
$prev_value
=
$past_region_hash
{
'BPcount'
};
}
if
(
defined
(
$current_chr_data
[
$chr_pos
+(
$window_size
-1)])){
%next_region_hash
= %{
$current_chr_data
[
$chr_pos
+(
$window_size
-1)]};
$next_value
=
$next_region_hash
{
'BPcount'
};
}
@{
$genome_cnv_data_windows
{
$cnv_key
}}[
$chr_pos
] += (@{
$genome_cnv_data_windows
{
$cnv_key
}}[
$chr_pos
-1]) - (
$prev_value
) + (
$next_value
);
print
$OUTPUT_FILE
"\n"
;
print
$OUTPUT_FILE
"$cnv_key"
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
$chr_pos
*$bin_size
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
(
$chr_pos
+
$window_size
)
*$bin_size
;
print
$OUTPUT_FILE
"\t"
;
if
(!
defined
(@{
$genome_cnv_data_windows
{
$cnv_key
}}[
$chr_pos
])){
print
$OUTPUT_FILE
"0"
;
}
else
{
print
$OUTPUT_FILE
(POSIX::ceil((@{
$genome_cnv_data_windows
{
$cnv_key
}}[
$chr_pos
])/2))/(
$window_size
*$bin_size
);
@{
$genome_mutation_data_windows
{
$cnv_key
}}[
$chr_pos
] += POSIX::ceil(@{
$genome_cnv_data_windows
{
$cnv_key
}}[
$chr_pos
]/2);
}
}
close
(
$OUTPUT_FILE
);
}
@current_chr_data
= ();
}
for
my
$trans_key
(
keys
%$genome_trans_data_hash_ref
){
@current_chr_data
= @{
$genome_trans_data_hash_ref
->{
$trans_key
}};
if
(
scalar
(
@current_chr_data
) > 0){
open
(
$OUTPUT_FILE
,
">"
,
"$output_directory/translocations/chr$trans_key"
.
"_translocation_localization.log"
) or
die
"ERROR: could not create file $output_directory/translocations/chr$trans_key"
.
"_translocation_localization.log"
;
print
$OUTPUT_FILE
"#chr\tstart\tend\tdensity"
;
@{
$genome_trans_data_windows
{
$trans_key
}}[0] = 0;
for
(
my
$chr_pos
= 0;
$chr_pos
<
$window_size
;
$chr_pos
++){
my
%region_hash
;
if
(!
defined
(
$current_chr_data
[
$chr_pos
])){
next
;
}
%region_hash
= %{
$current_chr_data
[
$chr_pos
]};
my
%trans_hash_in
;
my
%trans_hash_out
;
my
$size
= 0;
if
(
defined
(
$region_hash
{
'in'
})){
%trans_hash_in
= %{
$region_hash
{
'in'
}};
for
my
$key
(
keys
%trans_hash_in
){
$size
= @{
$trans_hash_in
{
$key
}};
$size
=
$size
/2;
@{
$genome_trans_data_windows
{
$trans_key
}}[0] +=
$size
;
}
}
if
(
defined
(
$region_hash
{
'out'
})){
%trans_hash_out
= %{
$region_hash
{
'out'
}};
for
my
$key
(
keys
%trans_hash_out
){
if
(
$key
eq
$trans_key
){
next
;
}
$size
= @{
$trans_hash_out
{
$key
}};
$size
=
$size
/2;
@{
$genome_trans_data_windows
{
$trans_key
}}[0] +=
$size
;
}
}
}
print
$OUTPUT_FILE
"\n"
;
print
$OUTPUT_FILE
"$trans_key"
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
"0"
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
(
$window_size
)
*$bin_size
;
print
$OUTPUT_FILE
"\t"
;
if
(!
defined
(@{
$genome_trans_data_windows
{
$trans_key
}}[0])){
print
$OUTPUT_FILE
"0"
;
}
else
{
print
$OUTPUT_FILE
(POSIX::ceil(@{
$genome_trans_data_windows
{
$trans_key
}}[0]))/(
$window_size
*$bin_size
);
@{
$genome_mutation_data_windows
{
$trans_key
}}[0] += POSIX::ceil(@{
$genome_trans_data_windows
{
$trans_key
}}[0]);
}
for
(
my
$chr_pos
= 1;
$chr_pos
<
scalar
(
@current_chr_data
);
$chr_pos
++){
if
( ((
$chr_pos
+(
$window_size
-1))
*$bin_size
) >
$chromosome_length
{
$trans_key
} ){
last
;
}
@{
$genome_trans_data_windows
{
$trans_key
}}[
$chr_pos
] = 0;
my
%prev_region_hash
;
my
%next_region_hash
;
my
$prev_value
= 0;
my
$next_value
= 0;
if
(
defined
(
$current_chr_data
[
$chr_pos
-1])){
%prev_region_hash
= %{
$current_chr_data
[
$chr_pos
-1]};
my
$size
= 0;
my
%prev_trans_hash_in
;
my
%prev_trans_hash_out
;
if
(
defined
(
$prev_region_hash
{
'in'
})){
%prev_trans_hash_in
= %{
$prev_region_hash
{
'in'
}};
for
my
$key
(
keys
%prev_trans_hash_in
){
$size
= @{
$prev_trans_hash_in
{
$key
}};
$size
=
$size
/2;
$prev_value
+=
$size
;
}
}
if
(
defined
(
$prev_region_hash
{
'out'
})){
%prev_trans_hash_out
= %{
$prev_region_hash
{
'out'
}};
for
my
$key
(
keys
%prev_trans_hash_out
){
if
(
$key
eq
$trans_key
){
next
;
}
$size
= @{
$prev_trans_hash_out
{
$key
}};
$size
=
$size
/2;
$prev_value
+=
$size
;
}
}
}
if
(
defined
(
$current_chr_data
[
$chr_pos
+(
$window_size
-1)])){
%next_region_hash
= %{
$current_chr_data
[
$chr_pos
+(
$window_size
-1)]};
my
$size
= 0;
my
%next_trans_hash_in
;
my
%next_trans_hash_out
;
if
(
defined
(
$next_region_hash
{
'in'
})){
%next_trans_hash_in
= %{
$next_region_hash
{
'in'
}};
for
my
$key
(
keys
%next_trans_hash_in
){
$size
= @{
$next_trans_hash_in
{
$key
}};
$size
=
$size
/2;
$next_value
+=
$size
;
}
}
if
(
defined
(
$next_region_hash
{
'out'
})){
%next_trans_hash_out
= %{
$next_region_hash
{
'out'
}};
for
my
$key
(
keys
%next_trans_hash_out
){
if
(
$key
eq
$trans_key
){
next
;
}
$size
= @{
$next_trans_hash_out
{
$key
}};
$size
=
$size
/2;
$next_value
+=
$size
;
}
}
}
@{
$genome_trans_data_windows
{
$trans_key
}}[
$chr_pos
] += (@{
$genome_trans_data_windows
{
$trans_key
}}[
$chr_pos
-1]) - (
$prev_value
) + (
$next_value
);
print
$OUTPUT_FILE
"\n"
;
print
$OUTPUT_FILE
"$trans_key"
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
$chr_pos
*$bin_size
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
(
$chr_pos
+
$window_size
)
*$bin_size
;
print
$OUTPUT_FILE
"\t"
;
if
(!
defined
(@{
$genome_trans_data_windows
{
$trans_key
}}[
$chr_pos
])){
print
$OUTPUT_FILE
"0"
;
}
else
{
print
$OUTPUT_FILE
(POSIX::ceil(@{
$genome_trans_data_windows
{
$trans_key
}}[
$chr_pos
]))/(
$window_size
*$bin_size
);
@{
$genome_mutation_data_windows
{
$trans_key
}}[
$chr_pos
] += POSIX::ceil(@{
$genome_trans_data_windows
{
$trans_key
}}[
$chr_pos
]);
}
}
close
(
$OUTPUT_FILE
);
}
@current_chr_data
= ();
}
for
my
$mutation_key
(
keys
%genome_mutation_data_windows
){
if
(
scalar
(@{
$genome_mutation_data_windows
{
$mutation_key
}}) > 0){
open
(
$OUTPUT_FILE
,
">"
,
"$output_directory/all_types/chr$mutation_key"
.
"_mutation_localization.log"
) or
die
"ERROR: could not create file $output_directory/all_types/chr$current_chr"
.
"_mutation_localization.log"
;
print
$OUTPUT_FILE
"#chr\tstart\tend\tdensity"
;
my
$density
;
for
(
my
$chr_pos
= 0;
$chr_pos
<
scalar
(@{
$genome_mutation_data_windows
{
$mutation_key
}});
$chr_pos
++){
$total_genome_windows
++;
if
(!
defined
(@{
$genome_mutation_data_windows
{
$mutation_key
}}[
$chr_pos
])){
$density
= 0;
}
else
{
$density
= (@{
$genome_mutation_data_windows
{
$mutation_key
}}[
$chr_pos
])/(
$window_size
*$bin_size
);
}
$genome_mean_mutation_density
+=
$density
;
print
$OUTPUT_FILE
"\n"
;
print
$OUTPUT_FILE
"$mutation_key"
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
$chr_pos
*$bin_size
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
(
$chr_pos
+
$window_size
)
*$bin_size
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
$density
;
}
close
(
$OUTPUT_FILE
);
}
}
$genome_mean_mutation_density
=
$genome_mean_mutation_density
/
$total_genome_windows
;
for
my
$mutation_key
(
keys
%genome_mutation_data_windows
){
if
(
scalar
(@{
$genome_mutation_data_windows
{
$mutation_key
}}) > 0){
my
$density
;
for
(
my
$chr_pos
= 0;
$chr_pos
<
scalar
(@{
$genome_mutation_data_windows
{
$mutation_key
}});
$chr_pos
++){
if
(!
defined
(@{
$genome_mutation_data_windows
{
$mutation_key
}}[
$chr_pos
])){
$density
= 0;
}
else
{
$density
= (@{
$genome_mutation_data_windows
{
$mutation_key
}}[
$chr_pos
])/(
$window_size
*$bin_size
);
}
$genome_mutation_density_standard_deviation
+= (
$density
-
$genome_mean_mutation_density
)**2;
}
}
}
$genome_mutation_density_standard_deviation
= (
$genome_mutation_density_standard_deviation
/
$total_genome_windows
)**0.5;
for
my
$mutation_key
(
keys
%genome_mutation_data_windows
){
if
(
scalar
(@{
$genome_mutation_data_windows
{
$mutation_key
}}) > 0){
my
$density
;
my
$region_z_score
= 0;
for
(
my
$chr_pos
= 0;
$chr_pos
<
scalar
(@{
$genome_mutation_data_windows
{
$mutation_key
}});
$chr_pos
++){
if
(!
defined
(@{
$genome_mutation_data_windows
{
$mutation_key
}}[
$chr_pos
])){
$density
= 0;
}
else
{
$density
= (@{
$genome_mutation_data_windows
{
$mutation_key
}}[
$chr_pos
])/(
$window_size
*$bin_size
);
}
$region_z_score
= (
$density
-
$genome_mean_mutation_density
)/
$genome_mutation_density_standard_deviation
;
if
(
$region_z_score
>=
$outlier_deviation
) {
if
(
$in_suspect_region
!=1){
$suspect_start
=
$chr_pos
*$bin_size
;
$in_suspect_region
= 1;
}
$suspect_chr
=
$mutation_key
;
$suspect_end
= (
$chr_pos
+
$window_size
)
*$bin_size
;
}
elsif
(
$in_suspect_region
==1){
push
(
@suspect_regions
, (
$suspect_chr
,
$suspect_start
,
$suspect_end
));
$suspect_chr
= -1;
$suspect_start
= -1;
$suspect_end
= -1;
$in_suspect_region
= 0;
}
if
(
(
$region_z_score
<
$outlier_deviation
) &&
(
$region_z_score
>= (
$outlier_deviation
-1) )
){
if
(
$in_likely_region
!=1){
$likely_start
=
$chr_pos
*$bin_size
;
$in_likely_region
= 1;
}
$likely_chr
=
$mutation_key
;
$likely_end
= (
$chr_pos
+
$window_size
)
*$bin_size
;
}
elsif
(
$in_likely_region
==1){
push
(
@likely_regions
, (
$likely_chr
,
$likely_start
,
$likely_end
));
$likely_chr
= -1;
$likely_start
= -1;
$likely_end
= -1;
$in_likely_region
= 0;
}
}
if
(
$in_suspect_region
==1){
push
(
@suspect_regions
, (
$suspect_chr
,
$suspect_start
,
$suspect_end
));
$suspect_chr
= -1;
$suspect_start
= -1;
$suspect_end
= -1;
$in_suspect_region
= 0;
}
if
(
$in_likely_region
==1){
push
(
@likely_regions
, (
$likely_chr
,
$likely_start
,
$likely_end
));
$likely_chr
= -1;
$likely_start
= -1;
$likely_end
= -1;
$in_likely_region
= 0;
}
}
}
return
(\
@suspect_regions
, \
@likely_regions
, \
%genome_cnv_data_windows
, \
%genome_trans_data_windows
, \
%genome_mutation_data_windows
);
}
sub
check_copy_number_count {
my
$output_directory
=
shift
;
my
$chromosome_copy_number_count_hash_ref
=
shift
;
my
$OUTPUT_FILE
;
open
(
$OUTPUT_FILE
,
">"
,
"$output_directory/copy_number_count.log"
) or
die
"ERROR: could not create file $output_directory/copy_number_count.log\n"
;
print
$OUTPUT_FILE
"#chr\tcopy_number\tnumber_of_regions"
;
for
my
$chr
(
sort
keys
%$chromosome_copy_number_count_hash_ref
){
my
%intermediate_hash
= %{
$chromosome_copy_number_count_hash_ref
->{
$chr
}};
for
my
$CN
(
sort
{
$intermediate_hash
{
$b
} <=>
$intermediate_hash
{
$a
} }
keys
%intermediate_hash
){
print
$OUTPUT_FILE
"\n"
;
print
$OUTPUT_FILE
"$chr"
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
"$CN"
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
$chromosome_copy_number_count_hash_ref
->{
$chr
}->{
$CN
};
}
}
close
(
$OUTPUT_FILE
);
}
sub
check_copy_number_switches {
my
$output_directory
=
shift
;
my
$chromosome_copy_number_count_hash_ref
=
shift
;
my
$switch_count
= 0;
my
$OUTPUT_FILE
;
open
(
$OUTPUT_FILE
,
">"
,
"$output_directory/copy_number_switches.log"
) or
die
"ERROR: could not create file $output_directory/copy_number_switches.log\n"
;
print
$OUTPUT_FILE
"#chr\tswitch_count"
;
for
my
$chr
(
sort
keys
%$chromosome_copy_number_count_hash_ref
){
my
%intermediate_hash
= %{
$chromosome_copy_number_count_hash_ref
->{
$chr
}};
for
my
$CN
(
sort
{
$intermediate_hash
{
$b
} <=>
$intermediate_hash
{
$a
} }
keys
%intermediate_hash
){
if
(
$CN
!= 2){
$switch_count
+= (
$chromosome_copy_number_count_hash_ref
->{
$chr
}->{
$CN
} * 2);
}
}
print
$OUTPUT_FILE
"\n"
;
print
$OUTPUT_FILE
"$chr"
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
$switch_count
;
$switch_count
= 0;
}
close
(
$OUTPUT_FILE
);
}
sub
calculate_interchromosomal_translocation_rate {
my
$output_directory
=
shift
;
my
$chromosome_translocation_count_hash_ref
=
shift
;
my
$OUTPUT_FILE
;
open
(
$OUTPUT_FILE
,
">"
,
"$output_directory/interchromosomal_translocation_rate.log"
) or
die
"ERROR: could not create file $output_directory/interchromosomal_translocation_rate.log\n"
;
print
$OUTPUT_FILE
"#chr1\tchr2\tcount"
;
for
my
$chr1
(
sort
keys
%$chromosome_translocation_count_hash_ref
){
my
%intermediate_hash
= %{
$chromosome_translocation_count_hash_ref
->{
$chr1
}};
for
my
$chr2
(
sort
{
$intermediate_hash
{
$b
} <=>
$intermediate_hash
{
$a
} }
keys
%intermediate_hash
){
print
$OUTPUT_FILE
"\n"
;
print
$OUTPUT_FILE
$chr1
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
$chr2
;
print
$OUTPUT_FILE
"\t"
;
print
$OUTPUT_FILE
$chromosome_translocation_count_hash_ref
->{
$chr1
}->{
$chr2
};
}
}
close
(
$OUTPUT_FILE
);
}
sub
analyze_suspect_regions {
my
$output_directory
=
shift
;
my
$suspect_regions_array_ref
=
shift
;
my
@suspect_regions
=
@$suspect_regions_array_ref
;
my
$genome_mutation_density_hash_ref
=
shift
;
my
%genome_mutation_density_hash
= %{
$genome_mutation_density_hash_ref
};
my
$genome_cnv_data_hash_ref
=
shift
;
my
$genome_trans_data_hash_ref
=
shift
;
my
$genome_trans_insertion_breakpoints_hash_ref
=
shift
;
my
$bin_size
=
shift
;
my
$localization_window_size
=
shift
;
my
$tp53_mutated
=
shift
;
my
$tp53_mutation_found
=
shift
;
my
$chromosome_cnv_breakpoints_hash_ref
=
shift
;
my
$chromosome_loh_breakpoints_hash_ref
=
shift
;
my
$suspect_regions_size
=
@suspect_regions
;
my
$OUTPUT_FILE
;
my
@suspect_region_data
= ();
my
$header_string
;
if
(
$suspect_regions_size
% 3 != 0){
die
"ERROR: suspect_regions_array has $suspect_regions_size entries. Value must be divisible by 3.\n"
;
}
mkdir
(
"$output_directory"
.
"suspect_regions"
,0770)
unless
(-d
"$output_directory"
.
"suspect_regions"
);
if
(!(-e
"$output_directory"
.
"suspect_regions"
)){
die
"ERROR: could not create folder $output_directory"
.
"suspect_regions"
;
}
open
(
$OUTPUT_FILE
,
">"
,
"$output_directory"
.
"suspect_regions/suspect_regions.yml"
) or
die
"ERROR: could not create file: $output_directory"
.
"suspect_regions/suspect_regions.yml\n"
;
$header_string
=
"file: Suspect Chromothriptic Regions\n"
;
$header_string
.=
"bin_size:\t\t\t$bin_size\n"
;
$header_string
.=
"localization_window_size:\t$localization_window_size\n"
;
$header_string
.=
"\n"
;
$header_string
.=
"genome_localization_score_weight:\t$genome_localization_weight\n"
;
$header_string
.=
"chromosome_localization_score_weight:\t$chromosome_localization_weight\n"
;
$header_string
.=
"cnv_score_weight:\t\t\t$cnv_weight\n"
;
$header_string
.=
"translocation_score_weight:\t\t$translocation_weight\n"
;
$header_string
.=
"insertion_breakpoint_score_weight:\t$insertion_breakpoint_weight\n"
;
$header_string
.=
"loh_score_weight:\t\t\t$loh_weight\n"
;
$header_string
.=
"tp53_mutation_score_weight:\t\t$tp53_mutated_weight\n"
;
$header_string
.=
"\n"
;
$header_string
.=
"min_mutation_density_z_score:\t$outlier_deviation\n"
;
$header_string
.=
"---\n"
;
$header_string
.=
"\n"
;
print
$OUTPUT_FILE
$header_string
;
for
(
my
$i
= 0;
$i
<
$suspect_regions_size
;
$i
+=3){
my
@region_data
= ();
$region_data
[0] =
$suspect_regions
[
$i
];
$region_data
[1] =
$suspect_regions
[
$i
+1];
$region_data
[2] =
$suspect_regions
[
$i
+2];
(
$region_data
[3],
$region_data
[4],
$region_data
[5],
$region_data
[6],
$region_data
[7],
$region_data
[8],
$region_data
[9],
$region_data
[10],
$region_data
[11],
$region_data
[12],
$region_data
[13]) = calculate_score(
$region_data
[0],
$region_data
[1],
$region_data
[2],
$genome_cnv_data_hash_ref
,
$genome_trans_data_hash_ref
,
$genome_mutation_density_hash_ref
,
$genome_trans_insertion_breakpoints_hash_ref
,
$tp53_mutated
,
$chromosome_cnv_breakpoints_hash_ref
,
$chromosome_loh_breakpoints_hash_ref
,
$bin_size
);
push
@suspect_region_data
, [
@region_data
];
}
@suspect_region_data
=
sort
{
$b
->[3] <=>
$a
->[3] }
@suspect_region_data
;
foreach
my
$score_data
(
@suspect_region_data
){
my
$chr
=
$score_data
->[0];
my
$start
=
$score_data
->[1];
my
$end
=
$score_data
->[2];
my
$score
=
sprintf
(
"%.5f"
,
$score_data
->[3]);
my
$chr_z_score
=
$score_data
->[4];
my
$region_density
=
sprintf
(
"%e"
,
$score_data
->[5]);
my
$cnv_number_hash_ref
=
$score_data
->[6];
my
%cnv_number_hash
;
my
$num_copy_num
;
if
(
defined
(
$cnv_number_hash_ref
)){
%cnv_number_hash
= %{
$cnv_number_hash_ref
};
$num_copy_num
=
keys
%cnv_number_hash
;
}
else
{
$num_copy_num
= 0;
}
my
$cnv_density
=
sprintf
(
"%e"
,
$score_data
->[7]);
my
$intertranslocation_hash_ref
=
$score_data
->[8];
my
$translocation_density
=
$score_data
->[9];
my
%intertranslocation_hash
;
my
$num_trans_chr
;
if
(
defined
(
$intertranslocation_hash_ref
)){
%intertranslocation_hash
= %{
$intertranslocation_hash_ref
};
$num_trans_chr
=
keys
%intertranslocation_hash
;
}
else
{
$num_trans_chr
= 0;
}
my
$breakpoint_insertions_array_ref
=
$score_data
->[10];
my
@breakpoint_insertions_array
;
my
$breakpoint_percentage
;
if
(
defined
(
$breakpoint_insertions_array_ref
)){
@breakpoint_insertions_array
=
@$breakpoint_insertions_array_ref
;
$breakpoint_percentage
=
sprintf
(
"%.2f"
,(
$breakpoint_insertions_array
[0]/
$breakpoint_insertions_array
[1])*100);
}
my
$loh_size
=
$score_data
->[11];
my
$hz_size
=
$score_data
->[12];
my
$percent_hz_lost
;
if
(
defined
(
$loh_size
) &&
defined
(
$hz_size
)){
$percent_hz_lost
=
sprintf
(
"%.2f"
,(
$loh_size
/
$hz_size
)*100);
}
my
@score_array
= @{
$score_data
->[13]};
my
$chr_density
=
$genome_mutation_density_hash
{
$chr
};
my
$print_string
;
$print_string
=
"chromosome:\t$chr\n"
;
$print_string
.=
"start:\t\t$start\n"
;
$print_string
.=
"end:\t\t$end\n"
;
$print_string
.=
"\n"
;
$print_string
.=
"final_score:\t\t\t$score\n"
;
$print_string
.=
"genome_localization_score:\t"
.
$score_array
[2]
*$genome_localization_weight
.
"\t("
.
$score_array
[2].
")"
.
"\n"
;
$print_string
.=
"chromosome_localization_score:\t"
.
$score_array
[1]
*$chromosome_localization_weight
.
"\t("
.
$score_array
[1].
")"
.
"\n"
;
$print_string
.=
"cnv_score:\t\t\t"
.
$score_array
[0]
*$cnv_weight
.
"\t("
.
$score_array
[0].
")"
.
"\n"
;
$print_string
.=
"translocation_score:\t\t"
.
$score_array
[3]
*$translocation_weight
.
"\t("
.
$score_array
[3].
")"
.
"\n"
;
$print_string
.=
"insertion_breakpoint_score:\t"
.
$score_array
[4]
*$insertion_breakpoint_weight
.
"\t("
.
$score_array
[4].
")"
.
"\n"
;
$print_string
.=
"loh_score:\t\t\t"
.
$score_array
[5]
*$loh_weight
.
"\t("
.
$score_array
[5].
")"
.
"\n"
;
$print_string
.=
"tp53_score:\t\t\t"
.
$score_array
[6]
*$tp53_mutated_weight
.
"\t("
.
$score_array
[6].
")\n"
;
$print_string
.=
"\n"
;
$print_string
.=
"mutation_density_of_region:\t$region_density\n"
;
$print_string
.=
"mutation_density_of_chromosome:\t$chr_density\n"
;
$print_string
.=
"standard_deviations_from_mean_of_chromosome_mutation_density:\t$chr_z_score\n"
;
$print_string
.=
"\n"
;
$print_string
.=
"density_of_copy_number_switches: $cnv_density\n"
;
$print_string
.=
"number_of_aberrant_copy_number_states:\t$num_copy_num\n"
;
if
(
$num_copy_num
>0){
$print_string
.=
"aberrant_copy_number_states:\n"
;
foreach
my
$key
(
sort
{
$cnv_number_hash
{
$b
} <=>
$cnv_number_hash
{
$a
} }
keys
%cnv_number_hash
){
$print_string
.=
"\t$key:\t$cnv_number_hash{$key}\n"
;
}
}
$print_string
.=
"\n"
;
$print_string
.=
"density_of_translocation_breakpoints: $translocation_density\n"
;
$print_string
.=
"number_of_translocation_chromosomes:\t$num_trans_chr\n"
;
if
(
$num_trans_chr
>0){
$print_string
.=
"translocation_chromosomes:\n"
;
foreach
my
$key
(
sort
{
$intertranslocation_hash
{
$b
} <=>
$intertranslocation_hash
{
$a
} }
keys
%intertranslocation_hash
){
$print_string
.=
"\t$key:\t$intertranslocation_hash{$key}\n"
;
}
}
$print_string
.=
"\n"
;
if
(
defined
(
$breakpoint_insertions_array_ref
)){
$print_string
.=
"insertion_data:\n"
;
$print_string
.=
"\tinsertions_found_at_translocation_breakpoints:\t$breakpoint_insertions_array[0]\n"
;
$print_string
.=
"\ttotal_translocation_breakpoints:\t$breakpoint_insertions_array[1]\n"
;
$print_string
.=
"\tpercentage:\t$breakpoint_percentage"
.
"%\n"
;
$print_string
.=
"\n"
;
}
if
(
$loh_size
!=-1){
$print_string
.=
"loh_data:\n"
;
$print_string
.=
"\ttotal_size_of_loh:\t$loh_size\n"
;
$print_string
.=
"\ttotal_size_of_original_heterozygosity:\t$hz_size\n"
;
$print_string
.=
"\tpercent_heterozygosity_lost:\t$percent_hz_lost"
.
"%\n"
;
$print_string
.=
"\n"
;
}
if
(
$tp53_mutated
&&
$tp53_mutation_found
){
$print_string
.=
"tp53_mutation_present:\t1 (forced and mutations found)\n"
;
}
elsif
(
$tp53_mutated
){
$print_string
.=
"tp53_mutation_present:\t1 (forced)\n"
;
}
elsif
(
$tp53_mutation_found
){
$print_string
.=
"tp53_mutation_present:\t1 (mutations found)\n"
;
}
else
{
$print_string
.=
"tp53_mutation_present:\t0\n"
;
}
$print_string
.=
"---\n"
;
$print_string
.=
"\n"
;
print
$OUTPUT_FILE
$print_string
;
$print_string
=
""
;
}
print
$OUTPUT_FILE
"..."
;
close
(
$OUTPUT_FILE
);
}
sub
analyze_likely_regions {
my
$output_directory
=
shift
;
my
$likely_regions_array_ref
=
shift
;
my
@likely_regions
=
@$likely_regions_array_ref
;
my
$genome_mutation_density_hash_ref
=
shift
;
my
$genome_cnv_data_hash_ref
=
shift
;
my
$genome_trans_data_hash_ref
=
shift
;
my
$bin_size
=
shift
;
my
$likely_regions_size
=
@likely_regions
;
my
$OUTPUT_FILE
;
my
@likely_region_data
= ();
if
(
$likely_regions_size
% 3 != 0){
die
"ERROR: suspect_regions_array has $likely_regions_size entries. Value must be divisible by 3.\n"
;
}
mkdir
(
"$output_directory"
.
"suspect_regions"
,0770)
unless
(-d
"$output_directory"
.
"suspect_regions"
);
if
(!(-e
"$output_directory"
.
"suspect_regions"
)){
die
"ERROR: could not create folder $output_directory"
.
"suspect_regions"
;
}
open
(
$OUTPUT_FILE
,
">"
,
"$output_directory"
.
"suspect_regions/likely_regions.log"
) or
die
"ERROR: could not create file: $output_directory"
.
"suspect_regions/likely_regions.log\n"
;
print
$OUTPUT_FILE
"Likely Chromothriptic Regions\n"
;
print
$OUTPUT_FILE
"High Mutation Density Z-Score:\t$outlier_deviation\n"
;
print
$OUTPUT_FILE
"Min Mutation Density Z-Score:\t$outlier_deviation-1\n"
;
print
$OUTPUT_FILE
"---------------------------------------\n"
;
print
$OUTPUT_FILE
"#chr\tstart\tend\tmutation_density"
;
for
(
my
$i
= 0;
$i
<
$likely_regions_size
;
$i
+=3){
my
@region_data
= ();
$region_data
[0] =
$likely_regions
[
$i
];
$region_data
[1] =
$likely_regions
[
$i
+1];
$region_data
[2] =
$likely_regions
[
$i
+2];
(
$region_data
[3],
$region_data
[4]) = calculate_region_mutation_density_score(
$region_data
[0],
$region_data
[1],
$region_data
[2],
$genome_cnv_data_hash_ref
,
$genome_trans_data_hash_ref
,
$genome_mutation_density_hash_ref
,
$bin_size
);
push
@likely_region_data
, [
@region_data
];
}
@likely_region_data
=
sort
{
$b
->[3] <=>
$a
->[3] }
@likely_region_data
;
foreach
my
$i
(
@likely_region_data
){
my
$chr
=
$i
->[0];
my
$start
=
$i
->[1];
my
$end
=
$i
->[2];
my
$region_density
=
$i
->[3];
print
$OUTPUT_FILE
"\n"
;
print
$OUTPUT_FILE
"$chr\t$start\t$end\t$region_density"
;
}
close
(
$OUTPUT_FILE
);
}
sub
calculate_score{
my
$chr
=
shift
;
my
$start
=
shift
;
my
$end
=
shift
;
my
$genome_cnv_data_hash_ref
=
shift
;
my
$genome_trans_data_hash_ref
=
shift
;
my
$genome_mutation_density_hash_ref
=
shift
;
my
$genome_trans_insertion_breakpoints_hash_ref
=
shift
;
my
$tp53_mutated
=
shift
;
my
$chromosome_cnv_breakpoints_hash_ref
=
shift
;
my
$chromosome_loh_breakpoints_hash_ref
=
shift
;
my
$bin_size
=
shift
;
my
$cnv_score
= 0;
my
$mutation_density_score
= 0;
my
$genome_localization_score
= 0;
my
$translocation_score
= 0;
my
$insertion_breakpoint_score
= 0;
my
$loh_score
= 0;
my
$final_score
= 0;
my
@score_array
;
my
$chr_mutation_density
;
my
$chr_z_score
;
my
$cnv_number_hash_ref
;
my
$cnv_density
;
my
$translocation_density
;
my
$mutation_density
;
my
$intertranslocation_hash_ref
;
my
$breakpoint_insertions_array_ref
;
my
$loh_size
= -1;
my
$heterozygous_size
;
(
$cnv_score
,
$cnv_number_hash_ref
,
$cnv_density
) = calculate_copy_number_scores(
$chr
,
$start
,
$end
,
$genome_cnv_data_hash_ref
,
$bin_size
);
(
$genome_localization_score
,
$chr_z_score
,
$chr_mutation_density
) = calculate_genome_localization_score(
$chr
,
$genome_mutation_density_hash_ref
);
(
$translocation_score
,
$intertranslocation_hash_ref
,
$translocation_density
) = calculate_translocation_score(
$chr
,
$start
,
$end
,
$genome_trans_data_hash_ref
,
$bin_size
);
if
(
defined
(
$genome_trans_insertion_breakpoints_hash_ref
)){
(
$insertion_breakpoint_score
,
$breakpoint_insertions_array_ref
) = calculate_insertion_breakpoint_score(
$chr
,
$start
,
$end
,
$genome_trans_data_hash_ref
,
$genome_trans_insertion_breakpoints_hash_ref
,
$bin_size
);
}
(
$mutation_density
,
$mutation_density_score
) = calculate_region_mutation_density_score(
$chr
,
$start
,
$end
,
$genome_cnv_data_hash_ref
,
$genome_trans_data_hash_ref
,
$genome_mutation_density_hash_ref
,
$bin_size
);
if
(
defined
(
$chromosome_loh_breakpoints_hash_ref
)){
(
$loh_score
,
$loh_size
,
$heterozygous_size
) = calculate_loh_score(
$chr
,
$start
,
$end
,
$chromosome_cnv_breakpoints_hash_ref
,
$chromosome_loh_breakpoints_hash_ref
);
}
$final_score
= (
$cnv_score
*$cnv_weight
) + (
$mutation_density_score
*$chromosome_localization_weight
) + (
$genome_localization_score
*$genome_localization_weight
) + (
$translocation_score
*$translocation_weight
) + (
$insertion_breakpoint_score
*$insertion_breakpoint_weight
) + (
$tp53_mutated
*$tp53_mutated_weight
) + (
$loh_score
*$loh_weight
);
push
(
@score_array
, (
$cnv_score
,
$mutation_density_score
,
$genome_localization_score
,
$translocation_score
,
$insertion_breakpoint_score
,
$loh_score
,
$tp53_mutated
));
return
(
$final_score
,
$chr_z_score
,
$mutation_density
,
$cnv_number_hash_ref
,
$cnv_density
,
$intertranslocation_hash_ref
,
$translocation_density
,
$breakpoint_insertions_array_ref
,
$loh_size
,
$heterozygous_size
, \
@score_array
);
}
sub
calculate_copy_number_scores {
my
$chr
=
shift
;
my
$start
=
shift
;
my
$end
=
shift
;
my
$genome_cnv_data_hash_ref
=
shift
;
my
%genome_cnv_data_hash
=
%$genome_cnv_data_hash_ref
;
my
$bin_size
=
shift
;
my
$start_index
=
$start
/ (
$bin_size
);
my
$end_index
=
$end
/ (
$bin_size
);
my
$cnv_score
= 0;
my
%cnv_number_hash
;
my
$cnv_switch_count
= 0;
my
$cnv_switch_density
= 0;
my
@chr_data
;
my
%cnv_hash
;
my
$mean
= 0;
my
$SD
= 0;
my
%cnv_significant
;
my
$significant_count
= 0;
if
(
defined
(
$genome_cnv_data_hash
{
$chr
})){
@chr_data
= @{
$genome_cnv_data_hash
{
$chr
}};
for
(
my
$i
=
$start_index
;
$i
<
$end_index
+1;
$i
++){
if
(!
defined
(
$chr_data
[
$i
])){
next
;
}
%cnv_hash
= %{
$chr_data
[
$i
]};
for
my
$key
(
keys
%cnv_hash
){
if
(
$key
eq
'BPcount'
){
$cnv_switch_count
+=
$cnv_hash
{
$key
};
}
else
{
$cnv_number_hash
{
$key
}+=
$cnv_hash
{
$key
};
}
}
}
$cnv_switch_density
=
$cnv_switch_count
/ (
$end
-
$start
);
for
my
$key
(
keys
%cnv_number_hash
){
$cnv_number_hash
{
$key
} = POSIX::ceil(
$cnv_number_hash
{
$key
});
$mean
+=
$cnv_number_hash
{
$key
};
}
if
(
scalar
(
keys
%cnv_number_hash
)==0){
$cnv_score
= 0;
return
(
$cnv_score
, \
%cnv_number_hash
,
$cnv_switch_density
);
}
$mean
=
$mean
/(
scalar
(
keys
%cnv_number_hash
));
for
my
$key
(
keys
%cnv_number_hash
){
$SD
+= (
$cnv_number_hash
{
$key
}-
$mean
)**2;
}
$SD
=
$SD
/(
scalar
(
keys
%cnv_number_hash
));
$SD
=
$SD
**0.5;
for
my
$key
(
keys
%cnv_number_hash
){
if
(
(
$SD
==0 )||
( ((
$cnv_number_hash
{
$key
}-
$mean
)/
$SD
) >= -1
*$outlier_deviation
)
){
$cnv_significant
{
$key
} =
$cnv_number_hash
{
$key
};
$cnv_score
=
$cnv_score
+
$cnv_significant
{
$key
}**2;
}
}
$cnv_score
=
$cnv_score
/ (
scalar
(
keys
%cnv_significant
));
$cnv_score
=
log
(
$cnv_score
)/
log
(2);
$cnv_score
+= 1;
$cnv_score
= 1 - (1/
$cnv_score
);
$cnv_score
=
$cnv_score
/(
scalar
(
keys
%cnv_significant
));
}
return
(
$cnv_score
, \
%cnv_number_hash
,
$cnv_switch_density
);
}
sub
calculate_genome_localization_score {
my
$chr
=
shift
;
my
$genome_mutation_density_hash_ref
=
shift
;
my
%genome_mutation_density_hash
= %{
$genome_mutation_density_hash_ref
};
my
$chr_mutation_density
=
$genome_mutation_density_hash
{
$chr
};
my
$mean_density
= 0;
my
$standard_deviation
= 0;
my
$z_score
= 0;
my
$p_val
= 0;
for
my
$key
(
keys
%genome_mutation_density_hash
){
$mean_density
+=
$genome_mutation_density_hash
{
$key
}/
$chromosome_length
{
$chr
};
}
$mean_density
=
$mean_density
/ 24;
for
my
$key
(
keys
%genome_mutation_density_hash
){
$standard_deviation
+= (((
$genome_mutation_density_hash
{
$key
}) - (
$mean_density
) )**2);
}
$standard_deviation
=
$standard_deviation
/ 24;
$standard_deviation
= (
$standard_deviation
)**0.5;
if
(
$mean_density
== 0 ||
$standard_deviation
== 0){
return
(0, 0,
$chr_mutation_density
);
}
$z_score
= (
$chr_mutation_density
-
$mean_density
) /
$standard_deviation
;
$p_val
= Statistics::Distributions::uprob(
$z_score
);
$p_val
= 0.5-
$p_val
;
$p_val
=
$p_val
/0.5;
if
(
$z_score
< 0){
$p_val
= 0;
}
return
(
$p_val
,
$z_score
,
$chr_mutation_density
);
}
sub
calculate_region_mutation_density_score {
my
$chr
=
shift
;
my
$start
=
shift
;
my
$end
=
shift
;
my
$genome_cnv_data_hash_ref
=
shift
;
my
%genome_cnv_data_hash
= %{
$genome_cnv_data_hash_ref
};
my
$genome_trans_data_hash_ref
=
shift
;
my
%genome_trans_data_hash
= %{
$genome_trans_data_hash_ref
};
my
$genome_mutation_density_hash_ref
=
shift
;
my
%genome_mutation_density_hash
= %{
$genome_mutation_density_hash_ref
};
my
$bin_size
=
shift
;
my
$start_index
=
$start
/
$bin_size
;
my
$end_index
=
$end
/
$bin_size
;
my
$mean_chr_mutation_density
=
$genome_mutation_density_hash
{
$chr
};
my
$chis_stat
;
my
$mutation_count
= 0;
my
$cnv_count
= 0;
my
$trans_count
= 0;
my
$mutation_density
= 0;
my
$mutation_density_score
;
my
@chr_data
;
my
%cnv_hash
;
my
%trans_hash
;
if
(
defined
(
$genome_cnv_data_hash
{
$chr
})){
@chr_data
= @{
$genome_cnv_data_hash
{
$chr
}};
for
(
my
$i
=
$start_index
;
$i
<
$end_index
+1;
$i
++){
if
(!
defined
(
$chr_data
[
$i
])){
next
;
}
%cnv_hash
= %{
$chr_data
[
$i
]};
$cnv_count
+=
$cnv_hash
{
'BPcount'
};
}
}
@chr_data
= ();
if
(
defined
(
$genome_trans_data_hash
{
$chr
})){
@chr_data
= @{
$genome_trans_data_hash
{
$chr
}};
for
(
my
$i
=
$start_index
;
$i
<
$end_index
+1;
$i
++){
if
(!
defined
(
$chr_data
[
$i
])){
next
;
}
%trans_hash
= %{
$chr_data
[
$i
]};
$trans_count
+=
$trans_hash
{
'BPcount'
};
}
}
$trans_count
= POSIX::ceil(
$trans_count
/2);
$mutation_count
=
$cnv_count
+
$trans_count
;
$mutation_density
=
$mutation_count
/ (
$end
-
$start
);
$chis_stat
=
abs
(((
log
(
$mutation_density
)-
log
(
$mean_chr_mutation_density
))**2)/(
log
(
$mean_chr_mutation_density
)));
$mutation_density_score
= 1-(Statistics::Distributions::chisqrprob(1,
$chis_stat
));
return
(
$mutation_density
,
$mutation_density_score
);
}
sub
calculate_translocation_score {
my
$chr
=
shift
;
my
$start
=
shift
;
my
$end
=
shift
;
my
$genome_trans_data_hash_ref
=
shift
;
my
%genome_trans_data_hash
= %{
$genome_trans_data_hash_ref
};
my
$bin_size
=
shift
;
my
$start_index
=
$start
/ (
$bin_size
);
my
$end_index
=
$end
/ (
$bin_size
);
my
$translocation_density
= 0;
my
@chr_data
;
my
%trans_breakpoints
;
my
%trans_breakpoint_spreads
;
my
@significant_chrs
;
my
@diffs
;
my
$diff_sum
;
my
$diff_count
;
my
%trans_number_hash
;
my
$mean
= 0;
my
$SD
= 0;
my
$weighted_sum
= 0.00;
my
$size
= 0.00;
my
$count
= 0;
my
$translocation_score
= 0;
my
$spread_factor
= 0;
my
$translocation_count
= 0;
if
(
defined
(
$genome_trans_data_hash
{
$chr
})){
@chr_data
= @{
$genome_trans_data_hash
{
$chr
}};
for
(
my
$i
=
$start_index
;
$i
<
$end_index
+1;
$i
++){
if
(!
defined
(
$chr_data
[
$i
])){
next
;
}
my
%trans_hash
= %{
$chr_data
[
$i
]};
my
%trans_hash_in
;
my
%trans_hash_out
;
if
(
defined
(
$trans_hash
{
'in'
})){
%trans_hash_in
= %{
$trans_hash
{
'in'
}};
for
my
$key
(
keys
%trans_hash_in
){
$size
= @{
$trans_hash_in
{
$key
}};
$size
=
$size
/2;
$trans_number_hash
{
$key
} +=
$size
;
push
(@{
$trans_breakpoints
{
$key
}},(@{
$trans_hash_in
{
$key
}}));
}
}
if
(
defined
(
$trans_hash
{
'out'
})){
%trans_hash_out
= %{
$trans_hash
{
'out'
}};
for
my
$key
(
keys
%trans_hash_out
){
$size
= @{
$trans_hash_out
{
$key
}};
$count
=
$size
;
if
(
$key
eq
$chr
){
foreach
my
$val
(@{
$trans_hash_out
{
$key
}}){
if
(
$val
>
$start
&&
$val
<
$end
){
$count
--;
}
}
}
$count
=
$count
/2;
$trans_number_hash
{
$key
} +=
$count
;
push
(@{
$trans_breakpoints
{
$key
}},(@{
$trans_hash_out
{
$key
}}));
}
}
}
$count
= 0;
if
(
keys
(
%trans_number_hash
) == 0){
$translocation_score
= 0;
$translocation_density
= 0;
return
(
$translocation_score
, \
%trans_number_hash
,
$translocation_density
);
}
for
my
$key
(
keys
%trans_number_hash
){
$trans_number_hash
{
$key
} = POSIX::ceil(
$trans_number_hash
{
$key
});
$count
+=
$trans_number_hash
{
$key
};
}
$translocation_density
=
$count
/ (
$end
-
$start
);
(
$SD
,
$mean
) = standard_deviation_and_mean(\
%trans_number_hash
,0);
for
my
$key
(
keys
%trans_number_hash
){
if
(
(
$SD
==0 )||
( ((
$trans_number_hash
{
$key
}-
$mean
)/
$SD
)>-1
*$outlier_deviation
)
){
push
(
@significant_chrs
,
$key
);
}
}
foreach
my
$key
(
@significant_chrs
){
$translocation_count
+=
$trans_number_hash
{
$key
};
}
foreach
my
$key
(
@significant_chrs
){
@{
$trans_breakpoints
{
$key
}} =
sort
{
$a
<=>
$b
} @{
$trans_breakpoints
{
$key
}};
$size
= @{
$trans_breakpoints
{
$key
}};
@diffs
= ();
$diff_sum
= 0;
$diff_count
= 0;
for
(
my
$i
= 1;
$i
<
$size
;
$i
++){
push
(
@diffs
, @{
$trans_breakpoints
{
$key
}}[
$i
]-@{
$trans_breakpoints
{
$key
}}[
$i
-1]);
}
if
(
$size
==1){
$trans_breakpoint_spreads
{
$key
} = 0;
$diff_count
= 1;
}
else
{
(
$SD
,
$mean
) = standard_deviation_and_mean(\
@diffs
,1);
foreach
my
$val
(
@diffs
){
if
(
(
$SD
==0 )||
( ((
$val
-
$mean
)/
$SD
)<
$outlier_deviation
)
){
$diff_sum
+=
$val
;
$diff_count
++;
}
}
}
$trans_breakpoint_spreads
{
$key
} =
$diff_sum
/
$diff_count
;
$spread_factor
= (
log
(
$trans_breakpoint_spreads
{
$key
}+1)/
log
(10))/((
log
(
$expected_mutation_density
)/
log
(10))*-1);
if
(
$spread_factor
==0){
$spread_factor
= 1;
}
$weighted_sum
+= (
$trans_number_hash
{
$key
}/
$spread_factor
)*(
$trans_number_hash
{
$key
}/
$translocation_count
);
}
my
$t2
= (1-(1/(
log
(1+
$weighted_sum
)/
log
(2))));
$size
=
@significant_chrs
;
$translocation_score
= (1-(1/(
log
(1+
$weighted_sum
)/
log
(2))));
if
(
$size
<
$translocation_cut_off_count
&&
$size
>2){
$translocation_score
= (1-(0.10*(
$size
-2)))
*$translocation_score
;
}
if
(
$size
>=
$translocation_cut_off_count
){
$translocation_score
= 0;
}
if
(
$translocation_score
>=1){
print
"score: "
.
$translocation_score
.
"\n"
;
print
"ws: "
.
$weighted_sum
.
"\n"
;
print
"sf: "
.
$spread_factor
.
"\n"
;
print
"term 1: "
;
print
(1/(1+(
log
(
$size
)/
log
(4))));
print
"\n"
;
print
"term 2: "
;
print
(1-(1/(
log
(
$weighted_sum
)/
log
(2))));
print
"\n"
;
}
}
return
(
$translocation_score
, \
%trans_number_hash
,
$translocation_density
);
}
sub
calculate_insertion_breakpoint_score {
my
$chr
=
shift
;
my
$start
=
shift
;
my
$end
=
shift
;
my
$genome_trans_data_hash_ref
=
shift
;
my
%genome_trans_data_hash
= %{
$genome_trans_data_hash_ref
};
my
$genome_trans_insertion_breakpoints_hash_ref
=
shift
;
my
%genome_trans_insertion_breakpoints_hash
= %{
$genome_trans_insertion_breakpoints_hash_ref
};
my
$bin_size
=
shift
;
my
$start_index
=
$start
/ (
$bin_size
);
my
$end_index
=
$end
/ (
$bin_size
);
my
$total_breakpoints
= 0;
my
$inserted_breakpoints
= 0;
my
$insertion_breakpoint_score
= 0;
my
@chr_data
;
my
%trans_hash
;
my
@inserted_breakpoint_list
;
my
@breakpoint_data
;
if
(
defined
(
$genome_trans_data_hash
{
$chr
})){
@chr_data
= @{
$genome_trans_data_hash
{
$chr
}};
for
(
my
$i
=
$start_index
;
$i
<
$end_index
+1;
$i
++){
if
(!
defined
(
$chr_data
[
$i
])){
next
;
}
%trans_hash
= %{
$chr_data
[
$i
]};
$total_breakpoints
+=
$trans_hash
{
'BPcount'
};
}
@inserted_breakpoint_list
= @{
$genome_trans_insertion_breakpoints_hash
{
$chr
}};
@inserted_breakpoint_list
=
sort
{
$a
<=>
$b
}
@inserted_breakpoint_list
;
foreach
my
$breakpoint
(
@inserted_breakpoint_list
){
if
(
$breakpoint
>
$end
){
last
;
}
if
(
$breakpoint
>
$start
){
$inserted_breakpoints
++;
}
}
$insertion_breakpoint_score
=
$inserted_breakpoints
/
$total_breakpoints
;
if
(
$insertion_breakpoint_score
> 1){
die
"ERROR: found a insertion_breakpoint_score greater than 1\n"
;
}
}
push
(
@breakpoint_data
,
$inserted_breakpoints
);
push
(
@breakpoint_data
,
$total_breakpoints
);
return
(
$insertion_breakpoint_score
, \
@breakpoint_data
);
}
sub
calculate_loh_score {
my
$chr
=
shift
;
my
$start
=
shift
;
my
$end
=
shift
;
my
$chromosome_cnv_breakpoints_hash_ref
=
shift
;
my
%chromosome_cnv_breakpoints_hash
= %{
$chromosome_cnv_breakpoints_hash_ref
};
my
$chromosome_loh_breakpoints_hash_ref
=
shift
;
my
%chromosome_loh_breakpoints_hash
= %{
$chromosome_loh_breakpoints_hash_ref
};
my
@cnv_breakpoints
;
my
$cnv_breakpoints_size
;
my
@loh_breakpoints
;
my
$loh_breakpoints_size
;
my
$original_heterozygous_size
=
$end
-
$start
;
my
$remaining_heterozygous_size
=
$end
-
$start
;
my
$loh_size
= 0;
my
$loh_score
= 0;
if
(
( !
defined
(
$chromosome_cnv_breakpoints_hash
{
$chr
}) ) ||
( !
defined
(
$chromosome_loh_breakpoints_hash
{
$chr
}) )
){
$loh_score
= 0;
$loh_size
= -1;
$remaining_heterozygous_size
= -1;
return
(
$loh_score
,
$loh_size
,
$remaining_heterozygous_size
);
}
@cnv_breakpoints
= @{
$chromosome_cnv_breakpoints_hash
{
$chr
}};
@cnv_breakpoints
=
sort
{
$a
<=>
$b
}
@cnv_breakpoints
;
$cnv_breakpoints_size
=
@cnv_breakpoints
;
for
(
my
$i
= 0;
$i
<
$cnv_breakpoints_size
;
$i
+=2){
my
$cnv_start
=
$i
;
my
$cnv_end
=
$i
+1;
my
$end_overlap
= 0;
my
$start_overlap
= 0;
if
(
$cnv_breakpoints
[
$cnv_start
] >
$end
){
last
;
}
if
(
$cnv_breakpoints
[
$cnv_end
] >=
$start
&&
$cnv_breakpoints
[
$cnv_end
] <=
$end
){
$end_overlap
= 1;
}
if
(
$cnv_breakpoints
[
$cnv_start
] >=
$start
&&
$cnv_breakpoints
[
$cnv_start
] <=
$end
){
$start_overlap
= 1;
}
if
(
$start_overlap
==1 &&
$end_overlap
==1) {
$remaining_heterozygous_size
-= (
$cnv_breakpoints
[
$cnv_end
] -
$cnv_breakpoints
[
$cnv_start
]);
}
elsif
(
$start_overlap
==1){
$remaining_heterozygous_size
-= (
$end
-
$cnv_breakpoints
[
$cnv_start
]);
}
elsif
(
$end_overlap
==1){
$remaining_heterozygous_size
-= (
$cnv_breakpoints
[
$cnv_end
]-
$start
);
}
elsif
(
$cnv_breakpoints
[
$cnv_start
] <
$start
&&
$cnv_breakpoints
[
$cnv_end
] >
$end
){
$remaining_heterozygous_size
= 0;
}
}
if
(
$remaining_heterozygous_size
== 0 ||
$remaining_heterozygous_size
==
$original_heterozygous_size
){
$loh_score
= 0;
$loh_size
= -1;
$remaining_heterozygous_size
= -1;
return
(
$loh_score
,
$loh_size
,
$remaining_heterozygous_size
);
}
@loh_breakpoints
= @{
$chromosome_loh_breakpoints_hash
{
$chr
}};
@loh_breakpoints
=
sort
{
$a
<=>
$b
}
@loh_breakpoints
;
$loh_breakpoints_size
=
@loh_breakpoints
;
for
(
my
$i
= 0;
$i
<
$loh_breakpoints_size
;
$i
+=2){
my
$start_overlap_region_loh
= 0;
my
$end_overlap_region_loh
= 0;
my
$loh_start
=
$i
;
my
$loh_end
=
$i
+1;
my
$loh_start_breakpoint
=
$loh_breakpoints
[
$loh_start
];
my
$loh_end_breakpoint
=
$loh_breakpoints
[
$loh_end
];
my
$loh_region_size
;
if
(
$loh_breakpoints
[
$loh_start
] >
$end
){
last
;
}
if
(
$loh_breakpoints
[
$loh_end
] >=
$start
&&
$loh_breakpoints
[
$loh_end
] <=
$end
){
$end_overlap_region_loh
= 1;
}
if
(
$loh_breakpoints
[
$loh_start
] >=
$start
&&
$loh_breakpoints
[
$loh_start
] <=
$end
){
$start_overlap_region_loh
= 1;
}
if
(
$start_overlap_region_loh
==1 &&
$end_overlap_region_loh
!=1){
$loh_end_breakpoint
=
$end
;
}
elsif
(
$end_overlap_region_loh
==1 &&
$start_overlap_region_loh
!=1){
$loh_start_breakpoint
=
$start
;
}
elsif
(
$loh_breakpoints
[
$loh_start
] <
$start
&&
$loh_breakpoints
[
$loh_end
] >
$end
){
$loh_start_breakpoint
=
$start
;
$loh_end_breakpoint
=
$end
;
$start_overlap_region_loh
=1;
$end_overlap_region_loh
=1;
}
if
(
$start_overlap_region_loh
!= 1 &&
$end_overlap_region_loh
!= 1){
next
;
}
$loh_region_size
=
$loh_end_breakpoint
-
$loh_start_breakpoint
;
for
(
my
$k
= 0;
$k
<
$cnv_breakpoints_size
;
$k
+=2){
my
$start_overlap_loh_cnv
= 0;
my
$end_overlap_loh_cnv
= 0;
my
$cnv_start
=
$k
;
my
$cnv_end
=
$k
+1;
if
(
$cnv_breakpoints
[
$cnv_start
] >
$loh_end_breakpoint
){
last
;
}
if
(
$cnv_breakpoints
[
$cnv_end
] >=
$loh_start_breakpoint
&&
$cnv_breakpoints
[
$cnv_end
] <=
$loh_end_breakpoint
){
$end_overlap_loh_cnv
= 1;
}
if
(
$cnv_breakpoints
[
$cnv_start
] >=
$loh_start_breakpoint
&&
$cnv_breakpoints
[
$cnv_start
] <=
$loh_end_breakpoint
){
$start_overlap_loh_cnv
= 1;
}
if
(
$start_overlap_loh_cnv
==1 ||
$end_overlap_loh_cnv
==1) {
$loh_region_size
-= (
$cnv_breakpoints
[
$cnv_end
] -
$cnv_breakpoints
[
$cnv_start
]);
}
elsif
(
$start_overlap_loh_cnv
==1){
$loh_region_size
-= (
$loh_end_breakpoint
-
$cnv_breakpoints
[
$cnv_start
]);
}
elsif
(
$end_overlap_loh_cnv
==1){
$loh_region_size
-= (
$cnv_breakpoints
[
$cnv_end
]-
$loh_start_breakpoint
);
}
elsif
(
$cnv_breakpoints
[
$cnv_start
] <
$loh_start_breakpoint
&&
$cnv_breakpoints
[
$cnv_end
] >
$loh_end_breakpoint
){
$loh_region_size
= 0;
}
}
$loh_size
+=
$loh_region_size
;
}
$loh_score
= 1 - (
$loh_size
/
$remaining_heterozygous_size
);
if
(
$loh_size
>
$remaining_heterozygous_size
){
die
"ERROR: invalid LOH size value found\n"
;
}
return
(
$loh_score
,
$loh_size
,
$remaining_heterozygous_size
);
}
sub
standard_deviation_and_mean{
my
$data_ref
=
shift
;
my
$type
=
shift
;
my
%hash
;
my
@array
;
my
$size
;
my
$mean
= 0;
my
$SD
= 0;
if
(
$type
==0){
%hash
= %{
$data_ref
};
if
((
scalar
(
keys
%hash
))==0){
die
"Found sample size of 0 when calculating SD-hash\n"
;
}
for
my
$key
(
keys
%hash
){
$mean
+=
$hash
{
$key
};
}
$mean
=
$mean
/(
scalar
(
keys
%hash
));
for
my
$key
(
keys
%hash
){
$SD
+= (
$hash
{
$key
}-
$mean
)**2;
}
$SD
=
$SD
/(
scalar
(
keys
%hash
));
$SD
=
$SD
**0.5;
}
elsif
(
$type
==1){
@array
= @{
$data_ref
};
$size
=
@array
;
if
(
$size
==0){
die
"Found sample size of 0 when calculating SD-array\n"
;
}
foreach
my
$val
(
@array
){
$mean
+=
$val
;
}
$mean
=
$mean
/
$size
;
foreach
my
$val
(
@array
){
$SD
+= (
$val
-
$mean
)**2;
}
$SD
=
$SD
/
$size
;
$SD
=
$SD
**0.5;
}
else
{
die
"ERROR: invalid SD/mean type found\n"
;
}
return
(
$SD
,
$mean
);
}
sub
next_arg {
my
$code
=
shift
;
$pos
++;
if
(
$pos
==
$ARGC
){
usage(
$code
);
}
}
sub
man_text {
print
"Main Usage:\n"
;
print
"\tperl -w shatterproof.pl --cnv <dir> --trans <dir> [--insrt <dir>] [--loh <dir>] [--tp53] --config <path> --output <dir>\n"
;
print
"\n"
;
print
"\tArguments:\n"
;
print
"\t\t--cnv\t\tDefine the path to the directory containing the CNV input files\n"
;
print
"\t\t--trans\t\tDefine the path to the directory containing the Translocation input files\n"
;
print
"\t\t--insrt\t\tDefine the path to the directory containing the insertion VCF input files\n"
;
print
"\t\t--loh\t\tDefine the path to the directory containing the LOH input files\n"
;
print
"\t\t--tp53\t\tIndicate that TP53 should be considered mutated, regardless of data\n"
;
print
"\t\t--config\tDefine the path to the ShatterProof config file\n"
;
print
"\t\t--output\tDefine the path to the directory where output should be placed\n"
;
print
"\t\tdir\t\tPath to a directory\n"
;
print
"\t\tpath\t\tPath to a file\n"
;
print
"\n"
;
print
"Help Usage:\n"
;
print
"\tperl -w shatterproof.pl --help\t\tThis help message.\n"
;
print
"\n"
;
exit
0;
}
sub
usage {
my
$usage_msg
=
shift
;
print
"u $usage_msg \n"
;
switch(
$usage_msg
){
case 0 {
print
"ERROR: missing arguments\n"
; }
case 1 {
print
"ERROR: 2nd argument missing\n"
; }
case 2 {
print
"ERROR: CNV directory missing\n"
; }
case 3 {
print
"ERROR: --trans option missing\n"
}
case 4 {
print
"ERROR: --cnv option missing\n"
}
case 5 {
print
"ERROR: Translocation directory missing\n"
}
case 6 {
print
"ERROR: --config option missing\n"
}
case 7 {
print
"ERROR: --trans option missing\n"
}
case 8 {
print
"ERROR: insertion directory missing\n"
}
case 9 {
print
"ERROR: --config option missing\n"
}
case 10 {
print
"ERROR: LOH directory missing \n"
}
case 11 {
print
"ERROR: --config option missing\n"
}
case 12 {
print
"ERROR: --config option missing\n"
}
case 13 {
print
"ERROR: Path to config file missing\n"
}
case 14 {
print
"ERROR: --output option missing\n"
}
case 15 {
print
"ERROR: --config option missing\n"
}
case 16 {
print
"ERROR: Output directory missing\n"
}
case 17 {
print
"ERROR: --output option missing\n"
}
case 18 {
print
"ERROR: too many arguments\n"
}
}
print
"Try perl -w shatteproof.pl --help\n"
;
exit
0;
}
sub
initialize_genome_hash {
my
%genome_region_data
= (
X
=> [],
Y
=> [],
1
=> [],
2
=> [],
3
=> [],
4
=> [],
5
=> [],
6
=> [],
7
=> [],
8
=> [],
9
=> [],
10
=> [],
11
=> [],
12
=> [],
13
=> [],
14
=> [],
15
=> [],
16
=> [],
17
=> [],
18
=> [],
19
=> [],
20
=> [],
21
=> [],
22
=> []
);
return
(\
%genome_region_data
);
}
sub
load_config_file {
my
$path
=
shift
;
print
"\nLoading configuration file"
;
my
$CONFIG
;
open
(
$CONFIG
,
"<"
,
"$path"
) or
die
"COULD NOT OPEN CONFIG FILE at path: $path \n"
;
eval
(<
$CONFIG
>)
while
(!
eof
(
$CONFIG
));
close
(
$CONFIG
);
print
" - Done\n"
;
}
Hide Show 172 lines of Pod
1;