package FASTX::Abi;
use 5.016;
use Carp qw(confess);
$FASTX::Abi::VERSION = '0.10.1';
#ABSTRACT: Read Sanger trace file (chromatograms) in FASTQ format. For traces called with I<hetero> option, the ambiguities will be split into two sequences to allow usage from NGS tools that usually do not understand IUPAC ambiguities.
our @valid_new_attributes = (
'filename', # *REQUIRED* input trace filepath
'trim_ends', # bool (default: 1) trim low quality ends
'wnd', # int (default: 16) sliding window for quality trim
'min_qual', # int (default: 22) threshold for low quality calls
'bad_bases', # int (default: 2) maximum number of low quality bases per window
'keep_abi', # bool (default: 0) import the Bio::Trace::ABIF object in FASTX::Abi (otherwise deleted after import)
);
our @valid_obj_attributes = (
'diff', # number of ambiguous bases
'diff_array', # array of ambiguous bases position
'sequence_name', # sequence name from filename
'instrument', # Instrument
'avg_peak_spacing', # Avg Peak Spacing in chromas
'version', # version chromatograms
'chromas', # Bio::Trace::ABIF object
'hetero', # ambiguity
'seq1', # Sequence 1 (non ambiguous, allele1)
'seq2', # Sequence 2 (non ambiguous, allele2)
'sequence', # Sequence, trimmed
'quality', # Quality, trimmed
'raw_sequence', # Raw sequence
'raw_quality', # Raw quality
'iso_seq', # Sequence are equal
'discard', # Low quality sequence
);
our %iupac = (
'R' => 'AG',
'Y' => 'CT',
'M' => 'CA',
'K' => 'TG',
'W' => 'TA',
'S' => 'CG'
);
sub new {
# Instantiate object
my ($class, $args) = @_;
my $self = {
filename => $args->{filename}, # Chromatogram file name
trim_ends => $args->{trim_ends}, # Trim low quality ends (bool)
min_qual => $args->{min_qual}, # Minimum quality
wnd => $args->{wnd}, # Window for end trimming
bad_bases => $args->{bad_bases}, # Number of low qual bases per $window_width
keep_abi => $args->{keep_abi}, # Do not destroy $self->{chromas} after use
};
#check valid inputs:
for my $input (sort keys %{ $args } ) {
if ( ! grep( /^$input$/, @valid_new_attributes ) ) {
confess("Method new() does not accept \"$input\" attribute. Valid attributes are:\n", join(', ', @valid_new_attributes));
}
}
# CHECK INPUT FILE
# -----------------------------------
if (not defined $self->{filename}) {
confess("ABI file must be provided when creating new object");
}
if (not -e $self->{filename}) {
confess("ABI file not found: ", $self->{filename});
}
my $abif;
my $try = eval
{
$abif = Bio::Trace::ABIF->new();
$abif->open_abif($self->{filename}) or confess "Error in file: ", $self->{filename};
1;
};
if (not $try) {
confess("Bio::Trace::ABIF was unable to read: ", $self->{filename});
}
my $object = bless $self, $class;
$object->{chromas} = $abif;
my @ext = ('.abi','.ab1','.ABI','.abI','.AB1','.ab');
my ($seqname) = basename($self->{filename}, @ext);
$object->{sequence_name} = $seqname;
# DEFAULTS
# -----------------------------------
$object->{trim_ends} = 1 unless defined $object->{trim_ends};
$object->{wnd} = 10 unless defined $object->{wnd};
$object->{min_qual} = 20 unless defined $object->{min_qual};
$object->{bad_bases} = 4 unless defined $object->{bad_bases};
$object->{keep_abi} = 0 unless defined $object->{keep_abi};
$object->{discard} = 0;
# GET SEQUENCE FROM AB1 FILE
# -----------------------------------
my $seq = _get_sequence($self);
if ($self->{keep_abi} == 0) {
$self->{chromas} = undef;
}
#check valid attributes:
for my $input (sort keys %{ $self} ) {
# [this is a developer's safety net]
# uncoverable condition false
if ( ! grep( /^$input$/, @valid_new_attributes, @valid_obj_attributes ) ) {
confess("Method new() does not accept \"$input\" attribute. Valid attributes are:\n", join(', ', @valid_new_attributes, @valid_obj_attributes));
}
}
return $object;
}
sub get_fastq {
my ($self, $name, $quality_value) = @_;
if (not defined $name) {
$name = $self->{sequence_name};
} elsif ($name=~/\s+/) {
$name =~s/\s+/_/g;
}
my $quality = $self->{quality};
if (defined $quality_value) {
if ($quality_value =~/^\d+$/ and $quality_value >= 10) {
my $q = chr(($quality_value <= 93 ? $quality_value : 93) + 33);
$quality = $q x length($quality);
} elsif (length($quality_value) == 1) {
$quality = $quality_value x length($quality);
} else {
confess("Supplied quality is neither a valid integer or a single char: <$quality_value>\n");
}
}
my $output = '';
if ( $self->{iso_seq} ) {
$output .= '@' . $name . "\n" .
$self->{seq1} . "\n+\n" .
$quality . "\n";
} else {
$output .= '@' . $name . "_1\n" .
$self->{seq1} . "\n+\n" .
$quality . "\n";
$output .= '@' . $name . "_2\n" .
$self->{seq2} . "\n+\n" .
$quality . "\n";
}
return $output;
}
sub get_trace_info {
my $self = shift;
my $data;
$data->{instrument} = $self->{instrument};
$data->{version} = $self->{version};
$data->{avg_peak_spacing} = $self->{avg_peak_spacing};
return $data;
}
sub _get_sequence {
my $self = shift;
my $abif = $self->{chromas};
$self->{raw_sequence} = $abif->sequence();
# Get quality values
my @qv = $abif->quality_values();
# Encode quality in FASTQ chars
my @fqv = map {chr(int(($_<=93? $_ : 93)*4/6) + 33)} @qv;
# FASTQ
my $q = join('', @fqv);
$self->{raw_quality} = $q;
$self->{sequence} = $self->{raw_sequence};
$self->{quality} = $self->{raw_quality};
# Trim
if ($self->{trim_ends}) {
#The Sequencing Analysis program determines the clear range of the sequence by trimming bases from the 5' to 3'
#ends until fewer than 4 bases out of 20 have a quality value less than 20.
#You can change these parameters by explicitly passing arguments to this method
#(the default values are $window_width = 20, $bad_bases_threshold = 4, $quality_threshold = 20).
# Note that Sequencing Analysis counts the bases starting from one, so you have to add one to the return values to get consistent results.
my ($begin_pos, $end_pos) = $abif->clear_range(
$self->{wnd},
$self->{bad_bases},
$self->{min_qual},
);
# This can be tested with low quality chromatograms
# *TODO* to ask for some bad trace
# uncoverable branch false
# uncoverable condition left
# uncoverable condition right
if ($begin_pos>0 and $end_pos>0) {
my $hi_qual_length = $end_pos-$begin_pos+1;
$self->{sequence} = substr($self->{sequence}, $begin_pos, $hi_qual_length);
$self->{quality} = substr($self->{quality} , $begin_pos, $hi_qual_length);
} else {
$self->{discard} = 1;
}
}
# Check hetero bases
if ($self->{sequence}!~/[ACGT][RYMKWS]+[ACGT]/i) {
$self->{hetero} = 0;
} else {
$self->{hetero} = 1;
}
# Check
$self->{diff_array} = ();
$self->{diff} = 0;
my $seq1 = '';
my $seq2 = '';
for (my $i = 0; $i<length($self->{sequence}); $i++) {
my $q0 = substr($self->{quality}, $i, 1);
my $s0 = substr($self->{sequence}, $i,1);
# Ambiguity detected:
if ($iupac{$s0}) {
my ($base1, $base2) = split //, $iupac{$s0};
$seq1.=$base1;
$seq2.=$base2;
$self->{diff}++;
push(@{ $self->{diff_array} }, $i);
} else {
$seq1.=$s0;
$seq2.=$s0;
}
}
$self->{seq1} = $seq1;
$self->{seq2} = $seq2;
if ($seq1 eq $seq2) {
$self->{iso_seq} = 1
} else {
$self->{iso_seq} = 0;
}
$self->{instrument} = $self->{chromas}->official_instrument_name();
$self->{version} = $self->{chromas}->abif_version();
$self->{avg_peak_spacing} = $self->{chromas}->avg_peak_spacing();
}
1;
__END__
=pod
=encoding UTF-8
=head1 NAME
FASTX::Abi - Read Sanger trace file (chromatograms) in FASTQ format. For traces called with I<hetero> option, the ambiguities will be split into two sequences to allow usage from NGS tools that usually do not understand IUPAC ambiguities.
=head1 VERSION
version 0.10.1
=head1 SYNOPSIS
use FASTX::Abi;
my $trace_fastq = FASTX::Abi->new({ filename => '/path/to/trace.ab1' });
# Print chromatogram as FASTQ (will print two sequences if there are ambiguities)
print $trace_fastq->get_fastq();
=head1 BUILD STATUS
=for html <p><a href="https://travis-ci.org/telatin/FASTX-Abi" title="Test report"><img alt="TravisCI tests badge" src="https://travis-ci.org/telatin/FASTX-Abi.svg?branch=master"></a></p>
The source from GitHub is tested using Travis-CI for continuous integration.
Check also the CPAN grid test for a better estimate of
build success using CPAN version of interest.
=head1 INSTALLATION
Via B<cpanminus>:
# Install cpanminus if you don't have it:
curl -L https://cpanmin.us | perl - --sudo App::cpanminus
# Install FASTX::Abi
cpanm FASTX::Abi
conda install -y -c bioconda perl-fastx-abi
=head1 HETERO CALLING (IUPAC AMBIGUITIES)
When Sanger-sequencing a mix of molecules (i.e. PCR product from heterozigous genome) containing a single-base polimorphisms,
B<if> the I<.ab1> file is called using the I<hetero modality> the sequence stored in the file will contain ambiguous bases (i.e. using DNA IUPAC characters).
This module is designed to produce NGS-compatible FASTQ, so when ambiguous bases are detected the two "alleles" will be split into two sequences
(of course, if more SNPs are present in the same trace, the output I<cannot> be phased).
The image below shows a trace file (.ab1) with a valid variant and a low quality end.
=for HTML <p><img src="https://raw.githubusercontent.com/telatin/FASTX-Abi/master/img/chromatogram.png" alt="Sanger trace AB1" /></p>
=head1 METHODS
=head2 new()
When creating a new object the only B<required> argument is I<filename>.
# Trimming is based on Bio::Trace::ABIF->clear_range()
my $trace_fastq = FASTX::Abi->new({
filename => "$filepath",
min_qual => 22,
wnd => 16,
bad_bases => 2,
keep_abi => 1, # keep Bio::Trace::ABIF object in $self->{chromas} after use
});
# Raw sequence and quality:
print "Raw seq/qual: ", $trace_fastq->{raw_sequence}, ", ", $trace_fastq->{raw_quality}, "\n";
# Trimmed sequence and quality:
print "Seq/qual: ", $trace_fastq->{sequence}, ", ", $trace_fastq->{quality}, "\n";
# If there are ambiguities (hetero bases, IUPAC):
if ($trace_fastq->{diff} > 0 ) {
print "Differences: ", join(',', @{ $trace_fastq->{diffs} }), "\n";
print "Seq 'A': ", $trace_fastq->{seq1}, "\n";
print "Seq 'B': ", $trace_fastq->{seq2}, "\n";
}
Input parameters:
=over 4
=item I<filename>, path
Name of the trace file (AB1 format)
=item I<trim_ends>, bool
Trim low quality ends (true by default, highly recommended)
=item I<min_qual>, int
Minimum quality value for trimming
=item I<wnd>, int
Window size for end trimming
=item I<bad_bases>, int
Maximum number of bad bases per window
=back
=head2 B<get_fastq($sequence_name, $fixed_quality)>
Return a string with the FASTQ formatted sequence (if no ambiguities) or two
sequences (if at least one ambiguity is found).
If no C<$sequence_name> is provided, the header will be made from the AB1 filename. If C<$sequence_name> is defined and contains spaces,
they will converted to underscores.
The C<$fixed_quality> is a user provided fixed quality value for each base printed. Can be an integer (10 < x < 40), or a single char.
In the first case it will be encoded as quality score (values above 93 will all be rendered as C<~>), in the second case the character
will be used as quality score. If not supplied the original
quality of the chromatogram will be used (that B<will be very low in SNPs positions>).
# Use 40 as quality for each base of the trace:
$trace->get_fastq(undef, 40);
=head2 get_trace_info()
Returns an object with trace information:
my $info = FASTX::Abi->get_trace_info();
print "Instrument: ", $info->{instrument}, "\n";
print "Version: ", $info->{version}, "\n";
print "Average peak distance: ", $info->{avg_peak_spacing}, "\n";
=head2 _get_sequence()
Internal routine (called by B<new()>) to populate sequence and quality.
See new()
=head1 SEE ALSO
This module is a wrapper around L<Bio::Trace::ABIF> by Nicola Vitacolonna.
=head1 AUTHOR
Andrea Telatin <andrea@telatin.com>
=head1 COPYRIGHT AND LICENSE
This software is Copyright (c) 2019 by Andrea Telatin.
This is free software, licensed under:
The MIT (X11) License
=cut