lib/FASTX/Abi.pm - metacpan.org


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
—
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
              package FASTX::Abi;
use 5.016;
use warnings;
use Carp qw(confess);
use Bio::Trace::ABIF;
use Data::Dumper;
use File::Basename;
$FASTX::Abi::VERSION = '0.10.1';
#ABSTRACT: Read Sanger trace file (chromatograms) in FASTQ format. For traces called with I<hetero> option, the ambiguities will be split into two sequences to allow usage from NGS tools that usually do not understand IUPAC ambiguities.
our @valid_new_attributes = (
  'filename',   # *REQUIRED* input trace filepath 
  'trim_ends',  # bool (default: 1) trim low quality ends
  'wnd',        # int (default: 16) sliding window for quality trim
  'min_qual',   # int (default: 22) threshold for low quality calls
  'bad_bases',  # int (default: 2)  maximum number of low quality bases per window
  'keep_abi',   # bool (default: 0) import the Bio::Trace::ABIF object in FASTX::Abi (otherwise deleted after import)
);
our @valid_obj_attributes = (
  'diff',             # number of ambiguous bases
  'diff_array',       # array of ambiguous bases position
  'sequence_name',    # sequence name from filename
  'instrument',       # Instrument
  'avg_peak_spacing', # Avg Peak Spacing in chromas
  'version',          # version chromatograms
  'chromas',          # Bio::Trace::ABIF object
  'hetero',           # ambiguity
  'seq1',             # Sequence 1 (non ambiguous, allele1)
  'seq2',             # Sequence 2 (non ambiguous, allele2)
  'sequence',         # Sequence, trimmed
  'quality',          # Quality, trimmed
  'raw_sequence',     # Raw sequence
  'raw_quality',      # Raw quality
  'iso_seq',          # Sequence are equal
  'discard',          # Low quality sequence
);
our %iupac = (
 'R' => 'AG',
 'Y' => 'CT',
 'M' => 'CA',
 'K' => 'TG',
 'W' => 'TA',
 'S' => 'CG'
);
sub new {
    # Instantiate object
    my ($class, $args) = @_;
    my $self = {
        filename  => $args->{filename},   # Chromatogram file name
        trim_ends => $args->{trim_ends},  # Trim low quality ends (bool)
        min_qual  => $args->{min_qual},   # Minimum quality
        wnd       => $args->{wnd},        # Window for end trimming
        bad_bases => $args->{bad_bases},  # Number of low qual bases per $window_width
        keep_abi  => $args->{keep_abi},   # Do not destroy $self->{chromas} after use
    };
    #check valid inputs:
    for my $input (sort keys %{ $args } ) {
      if ( ! grep( /^$input$/, @valid_new_attributes ) ) {
        confess("Method new() does not accept \"$input\" attribute. Valid attributes are:\n", join(', ', @valid_new_attributes));
      }
    }
    # CHECK INPUT FILE
    # -----------------------------------
    if (not defined $self->{filename}) {
      confess("ABI file must be provided when creating new object");
    }
    if (not -e $self->{filename}) {
      confess("ABI file not found: ", $self->{filename});
    }
    my $abif;
    my $try = eval
    {
      $abif = Bio::Trace::ABIF->new();
      $abif->open_abif($self->{filename}) or confess "Error in file: ", $self->{filename};
      1;
    };
    if (not $try) {
      confess("Bio::Trace::ABIF was unable to read: ", $self->{filename});
    }
    my $object = bless $self, $class;
    $object->{chromas} = $abif;
    my @ext = ('.abi','.ab1','.ABI','.abI','.AB1','.ab');
    my ($seqname) = basename($self->{filename}, @ext);
    $object->{sequence_name} = $seqname;
    # DEFAULTS
    # -----------------------------------
    $object->{trim_ends} = 1  unless defined $object->{trim_ends};
    $object->{wnd}       = 10 unless defined $object->{wnd};
    $object->{min_qual}  = 20 unless defined $object->{min_qual};
    $object->{bad_bases} = 4  unless defined $object->{bad_bases};
    $object->{keep_abi}  = 0  unless defined $object->{keep_abi};
    $object->{discard}   = 0;
    # GET SEQUENCE FROM AB1 FILE
    # -----------------------------------
    my $seq = _get_sequence($self);
    if ($self->{keep_abi} == 0) {
      $self->{chromas} = undef;
    }
    #check valid attributes:
    for my $input (sort keys %{ $self} ) {
      # [this is a developer's safety net]
      # uncoverable condition false
      if ( ! grep( /^$input$/, @valid_new_attributes, @valid_obj_attributes ) ) {
        confess("Method new() does not accept \"$input\" attribute. Valid attributes are:\n", join(', ', @valid_new_attributes, @valid_obj_attributes));
      }
    }
    return $object;
}
sub get_fastq {
  my ($self, $name, $quality_value) = @_;
  if (not defined $name) {
    $name = $self->{sequence_name};
  } elsif ($name=~/\s+/) {
    $name =~s/\s+/_/g;
  }
  my $quality = $self->{quality};
  if (defined $quality_value) {
    if ($quality_value =~/^\d+$/ and $quality_value >= 10) {
      my $q = chr(($quality_value <= 93 ? $quality_value : 93) + 33);
      $quality = $q x length($quality);
    } elsif (length($quality_value) == 1) {
      $quality = $quality_value x length($quality);
    } else {
      confess("Supplied quality is neither a valid integer or a single char: <$quality_value>\n");
    }
  }
  my $output = '';
  if ( $self->{iso_seq} ) {
    $output .= '@' . $name . "\n" .
                $self->{seq1} . "\n+\n" .
                $quality . "\n";
  } else {
    $output .= '@' . $name . "_1\n" .
                $self->{seq1} . "\n+\n" .
                $quality . "\n";
    $output .= '@' . $name . "_2\n" .
                $self->{seq2} . "\n+\n" .
                $quality . "\n";
  }
  return $output;
}
sub get_trace_info {
  my $self   = shift;
  my $data;
  $data->{instrument} = $self->{instrument};
  $data->{version}    = $self->{version};
  $data->{avg_peak_spacing} = $self->{avg_peak_spacing};
  return $data;
}
sub _get_sequence {
    my $self   = shift;
    my $abif = $self->{chromas};
    $self->{raw_sequence}  = $abif->sequence();
    # Get quality values
    my @qv       = $abif->quality_values();
    # Encode quality in FASTQ chars
    my @fqv      = map {chr(int(($_<=93? $_ : 93)*4/6) + 33)} @qv;
    # FASTQ
    my $q = join('', @fqv);
    $self->{raw_quality} = $q;
    $self->{sequence} = $self->{raw_sequence};
    $self->{quality}  = $self->{raw_quality};
    # Trim
    if ($self->{trim_ends}) {
        #The Sequencing Analysis program determines the clear range of the sequence by trimming bases from the 5' to 3'
        #ends until fewer than 4 bases out of 20 have a quality value less than 20.
        #You can change these parameters by explicitly passing arguments to this method
        #(the default values are $window_width = 20, $bad_bases_threshold = 4, $quality_threshold = 20).
        # Note that Sequencing Analysis counts the bases starting from one, so you have to add one to the return values to get consistent results.
        my ($begin_pos, $end_pos) = $abif->clear_range(
                                    $self->{wnd},
                                    $self->{bad_bases},
                                    $self->{min_qual},
                                   );
# This can be tested with low quality chromatograms 
# *TODO* to ask for some bad trace
# uncoverable branch false
# uncoverable condition left
# uncoverable condition right
             if ($begin_pos>0 and $end_pos>0) {
                my $hi_qual_length = $end_pos-$begin_pos+1;
                $self->{sequence} = substr($self->{sequence}, $begin_pos, $hi_qual_length);
                $self->{quality}  = substr($self->{quality} , $begin_pos, $hi_qual_length);
             } else {
                $self->{discard} = 1;
             }
    }
    # Check hetero bases
    if ($self->{sequence}!~/[ACGT][RYMKWS]+[ACGT]/i) {
        $self->{hetero} = 0;
    } else {
        $self->{hetero} = 1;
    }
    # Check
    $self->{diff_array} = ();
    $self->{diff} = 0;
    my $seq1 = '';
    my $seq2 = '';
    for (my $i = 0; $i<length($self->{sequence}); $i++) {
      my $q0 = substr($self->{quality}, $i, 1);
      my $s0 = substr($self->{sequence}, $i,1);
      # Ambiguity detected:
      if ($iupac{$s0}) {
        my ($base1, $base2) = split //, $iupac{$s0};
        $seq1.=$base1;
        $seq2.=$base2;
        $self->{diff}++;
        push(@{ $self->{diff_array} }, $i);
      } else {
        $seq1.=$s0;
        $seq2.=$s0;
      }
    }
    $self->{seq1} = $seq1;
    $self->{seq2} = $seq2;
    if ($seq1 eq $seq2) {
       $self->{iso_seq} = 1
     } else {
       $self->{iso_seq} = 0;
     }
     $self->{instrument} = $self->{chromas}->official_instrument_name();
     $self->{version}    = $self->{chromas}->abif_version();
     $self->{avg_peak_spacing} = $self->{chromas}->avg_peak_spacing();
}
1;
__END__
=pod
=encoding UTF-8
=head1 NAME
FASTX::Abi - Read Sanger trace file (chromatograms) in FASTQ format. For traces called with I<hetero> option, the ambiguities will be split into two sequences to allow usage from NGS tools that usually do not understand IUPAC ambiguities.
=head1 VERSION
version 0.10.1
=head1 SYNOPSIS
  use FASTX::Abi;
  my $trace_fastq = FASTX::Abi->new({ filename => '/path/to/trace.ab1' });
  # Print chromatogram as FASTQ (will print two sequences if there are ambiguities)
  print $trace_fastq->get_fastq();
=head1 BUILD STATUS
=for html <p><a href="https://travis-ci.org/telatin/FASTX-Abi" title="Test report"><img alt="TravisCI tests badge" src="https://travis-ci.org/telatin/FASTX-Abi.svg?branch=master"></a></p>
The source from GitHub is tested using Travis-CI for continuous integration. 
Check also the CPAN grid test for a better estimate of 
build success using CPAN version of interest. 
=head1 INSTALLATION
Via B<cpanminus>:
    # Install cpanminus if you don't have it:
    curl -L https://cpanmin.us | perl - --sudo App::cpanminus
    # Install FASTX::Abi
    cpanm FASTX::Abi
Via Miniconda L<https://docs.conda.io/en/latest/miniconda.html>:
    conda install -y -c bioconda perl-fastx-abi
=head1 HETERO CALLING (IUPAC AMBIGUITIES)
When Sanger-sequencing a mix of molecules (i.e. PCR product from heterozigous genome) containing a single-base polimorphisms,
B<if> the I<.ab1> file is called using the I<hetero modality> the sequence stored in the file will contain ambiguous bases (i.e. using DNA IUPAC characters).
This module is designed to produce NGS-compatible FASTQ, so when ambiguous bases are detected the two "alleles" will be split into two sequences
(of course, if more SNPs are present in the same trace, the output I<cannot> be phased).
The image below shows a trace file (.ab1) with a valid variant and a low quality end.
=for HTML <p><img src="https://raw.githubusercontent.com/telatin/FASTX-Abi/master/img/chromatogram.png" alt="Sanger trace AB1" /></p>
=head1 METHODS
=head2 new()
When creating a new object the only B<required> argument is I<filename>.
  # Trimming is based on Bio::Trace::ABIF->clear_range()
  my $trace_fastq = FASTX::Abi->new({
    filename   => "$filepath",
    min_qual   => 22,
    wnd        => 16,
    bad_bases  => 2,
    keep_abi   => 1,      # keep Bio::Trace::ABIF object in $self->{chromas} after use
  });
  # Raw sequence and quality:
  print "Raw seq/qual: ", $trace_fastq->{raw_sequence}, ", ", $trace_fastq->{raw_quality}, "\n";
  # Trimmed sequence and quality:
  print "Seq/qual: ", $trace_fastq->{sequence}, ", ", $trace_fastq->{quality}, "\n";
  # If there are ambiguities (hetero bases, IUPAC):
  if ($trace_fastq->{diff} > 0 ) {
    print "Differences: ", join(',', @{ $trace_fastq->{diffs} }), "\n";
    print "Seq 'A': ", $trace_fastq->{seq1}, "\n";
    print "Seq 'B': ", $trace_fastq->{seq2}, "\n";
  }
Input parameters:
=over 4
=item I<filename>, path
Name of the trace file (AB1 format)
=item I<trim_ends>, bool
Trim low quality ends (true by default, highly recommended)
=item I<min_qual>, int
Minimum quality value for trimming
=item I<wnd>, int
Window size for end trimming
=item I<bad_bases>, int
Maximum number of bad bases per window
=back
=head2 B<get_fastq($sequence_name, $fixed_quality)>
Return a string with the FASTQ formatted sequence (if no ambiguities) or two
sequences (if at least one ambiguity is found).
If no C<$sequence_name> is provided, the header will be made from the AB1 filename. If C<$sequence_name> is defined and contains spaces,
they will converted to underscores.
The C<$fixed_quality> is a user provided fixed quality value for each base printed. Can be an integer (10 < x < 40), or a single char.
In the first case it will be encoded as quality score (values above 93 will all be rendered as C<~>), in the second case the character 
will be used as quality score. If not supplied the original
quality of the chromatogram will be used (that B<will be very low in SNPs positions>). 
  # Use 40 as quality for each base of the trace:
  $trace->get_fastq(undef, 40);
=head2 get_trace_info()
Returns an object with trace information:
  my $info = FASTX::Abi->get_trace_info();
  print "Instrument:            ", $info->{instrument}, "\n";
  print "Version:               ", $info->{version}, "\n";
  print "Average peak distance: ", $info->{avg_peak_spacing}, "\n";
=head2 _get_sequence()
Internal routine (called by B<new()>) to populate sequence and quality.
See new()
=head1 SEE ALSO
This module is a wrapper around L<Bio::Trace::ABIF> by Nicola Vitacolonna.
=head1 AUTHOR
Andrea Telatin <andrea@telatin.com>
=head1 COPYRIGHT AND LICENSE
This software is Copyright (c) 2019 by Andrea Telatin.
This is free software, licensed under:
  The MIT (X11) License
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)