#!/usr/bin/env perl
# ABSTRACT: Concatenate FASTA or FASTQ files
# PODNAME: fu-cat
use 5.012;
use Digest::MD5 qw(md5_base64);
use File::Basename;
use FindBin qw($RealBin);
use Getopt::Long;
use Pod::Usage;
use FASTX::Reader;
# The following placeholder is to be programmatically replaced with 'use lib "$RealBin/../lib"' if needed
#~loclib~
# Probe for two files next to the script's real location: the Proch::N50
# module under ../lib and a ../Changes file.
# NOTE(review): the body is empty in this copy, so the probe has no effect;
# presumably the library path is injected through the placeholder comment
# above when needed -- confirm with the build tooling.
if ( ( -e "$RealBin/../lib/Proch/N50.pm" ) && ( -e "$RealBin/../Changes" ) ) {
}
use
Proch::Seqfu;
# Version string shown in banners: the Proch::Seqfu package version, or
# "<Dev>" when that variable is undefined.
my $VERSION = $Proch::Seqfu::VERSION // "<Dev>";

# File name part of the path this script was invoked with (used in messages).
my $BASENAME = basename($0);

# Switches filled in by GetOptions; all start undefined.
my $opt_force_fasta;
my $opt_force_fastq;
my $opt_upper;
my $opt_help;
my $opt_verbose;
my $opt_version;

# Filtering / dereplication settings; all start undefined.
my $opt_minlen;
my $opt_maxlen;
my $opt_dereplicate;
my $opt_rename_md5;

# String placed between a repeated sequence name and its counter.
my $opt_separator = '.';

# Value of -r/--rename (declared without a default).
my $opt_rename_string;

# Line width, initialised from the Proch::Seqfu package variable.
my $opt_line_size = $Proch::Seqfu::fu_linesize;
# Parse the command line into the $opt_* variables. When GetOptions reports
# a failure, print a short hint on STDERR and exit with status 1.
GetOptions(
    'f|fasta'       => \$opt_force_fasta,
    'q|fastq'       => \$opt_force_fastq,
    'd|dereplicate' => \$opt_dereplicate,
    '5|rename-md5'  => \$opt_rename_md5,
    'r|rename=s'    => \$opt_rename_string,
    's|separator=s' => \$opt_separator,
    'w|width=i'     => \$opt_line_size,
    'u|uppercase'   => \$opt_upper,
    'l|minlen=i'    => \$opt_minlen,
    'm|maxlen=i'    => \$opt_maxlen,
    'version'       => \$opt_version,
    'verbose'       => \$opt_verbose,
    'help'          => \$opt_help,
) or do {
    say STDERR "Wrong parameters: type $BASENAME --help for full documentation.";
    exit 1;
};
# Publish the effective line width and verbosity through the Proch::Seqfu
# package variables.
$Proch::Seqfu::fu_linesize = $opt_line_size;
$Proch::Seqfu::fu_verbose  = $opt_verbose;

# --fasta and --fastq cannot be combined
die " PARAMETER ERROR:\n You must specify either --fasta or --fastq (or none), not both.\n"
    if $opt_force_fasta and $opt_force_fastq;

# --version: version() prints the version lines and exits
version() if $opt_version;

# --help: render the full manual and exit with status 0
if ($opt_help) {
    pod2usage( { -exitval => 0, -verbose => 2 } );
}

# Without file arguments: show the short banner, then queue '-' so that
# the input is taken from STDIN.
unless ( defined $ARGV[0] ) {
    usage();
    push @ARGV, '-';
}
# Choose the output format when neither --fasta nor --fastq was given:
# FASTA as soon as STDIN is among the inputs or a file is reported as
# 'fasta', FASTQ otherwise.
if ( !$opt_force_fasta && !$opt_force_fastq ) {

    # Prescan the inputs
    FILE:
    for my $filename (@ARGV) {
        my $is_stdin = ( $filename eq '-' or $filename eq '{{STDIN}}' );
        if ($is_stdin) {
            $opt_force_fasta = 1;
            verbose("When requiring to parse STDIN, output format is defaulted to FASTA (unless --fastq is on)");
        }

        # Once either flag has been defined there is nothing left to probe
        next FILE if defined $opt_force_fastq or defined $opt_force_fasta;

        verbose("Autodetecting ");
        my $probe = FASTX::Reader->new( { filename => "$filename" } );
        next FILE if $filename eq '-';

        my $detected = $probe->getFileFormat("$filename");
        $opt_force_fasta = 1 if $detected eq 'fasta';
    }

    if ($opt_force_fasta) {
        verbose("Output format: FASTA");
    }
    else {
        verbose("Output format: FASTQ");
        $opt_force_fastq = 1;
    }
}
# Bookkeeping shared by all input files.
my %printed_seq_names = ();    # original name   => number of records printed with it
my %printed_seqs      = ();    # sequence digest => times seen (--dereplicate only)

# Process all the files: read every record, apply the length and
# dereplication filters, make the name unique and print it in the
# selected output format.
for my $filename (@ARGV) {
    say STDERR " - $filename" if ($Proch::Seqfu::fu_verbose);

    # '-' is replaced by the '{{STDIN}}' token before the reader is built
    # (presumably the name FASTX::Reader uses for standard input -- confirm).
    $filename = '{{STDIN}}' if ( $filename eq '-' );
    my $reader = FASTX::Reader->new( { filename => "$filename" } );

    while ( my $s = $reader->getRead() ) {
        my $seq_length = length( $s->{seq} );

        # Skip short sequences
        next if defined $opt_minlen and $seq_length < $opt_minlen;

        # Skip long sequences
        next if defined $opt_maxlen and $seq_length > $opt_maxlen;

        # Dereplicate: only the first record with a given sequence survives.
        # The digest is computed on the sequence exactly as read.
        my $seq_sum;
        if ($opt_dereplicate) {
            $seq_sum = md5_base64( $s->{seq} );
            next if $printed_seqs{$seq_sum}++;
        }

        # Duplicate name finder. It runs after the filters so that only
        # records that are really printed are counted: a skipped record must
        # not cause a suffix on a later record with the same name.
        my $name;
        if ( $opt_dereplicate and $opt_rename_md5 ) {
            # --rename-md5 (effective with -d only): the digest is the name
            $name = $seq_sum;
        }
        else {
            $name = $s->{name};
            my $already_printed = $printed_seq_names{ $s->{name} };
            $name .= $opt_separator . $already_printed if $already_printed;
            $printed_seq_names{ $s->{name} }++;
        }

        # --uppercase applies to both output formats
        my $sequence = $opt_upper ? uc( $s->{seq} ) : $s->{seq};

        # Print seq
        if ($opt_force_fasta) {
            fu_printfasta( $name, $s->{comment}, $sequence );
        }
        elsif ($opt_force_fastq) {
            if ( not defined $s->{qual} ) {
                die " FATAL ERROR [$filename]:\n Trying to print sequence <$name> in FASTQ format, but no quality found\n";
            }
            fu_printfastq( $name, $s->{comment}, $sequence, $s->{qual} );
        }
    }
}
# Print the short banner on STDERR: program name and version, a one-line
# description, a rule of 50 dashes, a pointer to --help and a notice that
# the program is waiting for data on STDIN.
sub usage {
    my $rule   = " " . ( '-' x 50 );
    my @banner = (
        " $BASENAME $VERSION",
        " A program to concatenate sequence files",
        $rule,
        " Type \`$BASENAME --help\` to display the full manual",
        " Waiting for sequences from STDIN. Press Ctrl-C to exit.",
    );
    return print STDERR map { "$_\n" } @banner;
}
# Print "<program> <version>" on STDOUT, the versions of Proch::Seqfu and
# FASTX::Reader on STDERR, then terminate the program with exit status 0.
sub version {
    say $BASENAME . " " . $VERSION;
    say STDERR "Using Proch::Seqfu=" . $Proch::Seqfu::VERSION
             . " and FASTX::Reader=" . $FASTX::Reader::VERSION;
    exit 0;
}
__END__
=pod
=encoding UTF-8
=head1 NAME
fu-cat - Concatenate FASTA or FASTQ files
=head1 VERSION
version 1.7.0
=head1 SYNOPSIS
fu-cat [options] [FILE1 FILE2 FILE3...]
=head1 DESCRIPTION
This program parses a list of FASTA/FASTQ and will concatenate them
ensuring consistent output. Will rename duplicate sequence names.
Will try to autodetect the format of all files before executing and
decide accordingly the output format (FASTA if at least one of the
files is FASTA, otherwise FASTQ). If reading from STDIN the first
sequence is in FASTQ format, will skip all the sequences without a
quality string.
If no files are provided the program will try reading from STDIN,
otherwise add a '-' to the list of files to also read from STDIN.
=head1 NAME
fu-cat - concatenate FASTA or FASTQ files
=head1 PARAMETERS
=over 4
=item I<-s>, I<--separator>
When a second sequence with a name that was already printed is found,
the program will append a progressive number, separated by this string.
Use `fu-rename` if you need more options.
[default: "."]
=item I<-f>, I<--fasta>
Force FASTA output
=item I<-q>, I<--fastq>
Force FASTQ output. Will B<not> print any sequence without quality
(the program stops with a fatal error when one is found)
=item I<-d>, I<--dereplicate>
Print each sequence only once
=item I<-5>, I<--rename-md5>
(use with -d) rename each sequence name with the MD5sum of the sequence.
=item I<-l>, I<--minlen> INT
Do not print sequences shorter (exclusive) than INT
=item I<-m>, I<--maxlen> INT
Do not print sequences longer (exclusive) than INT
=item I<-u>, I<--uppercase>
Will print the whole sequence in uppercase
=item I<-w>, I<--width> INT
Size of the FASTA lines. Specifying 0 will print the whole sequence on a single line (default: 0)
=item I<--verbose>
Print more details
=item I<--help>
Display this help page
=item I<--version>
Print version and exit
=back
=head1 MODERN ALTERNATIVE
This suite of tools has been superseded by B<SeqFu>, a compiled
program providing faster and safer tools for sequence analysis.
This suite is maintained for the higher portability of Perl scripts
under certain circumstances.
SeqFu is available at L<https://github.com/telatin/seqfu2>, and
can be installed with BioConda C<conda install -c bioconda seqfu>
=head1 CITING
Telatin A, Fariselli P, Birolo G.
I<SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files>.
Bioengineering 2021, 8, 59. L<https://doi.org/10.3390/bioengineering8050059>
=head1 AUTHOR
Andrea Telatin <andrea@telatin.com>
=head1 COPYRIGHT AND LICENSE
This software is Copyright (c) 2018-2027 by Quadram Institute Bioscience.
This is free software, licensed under:
The MIT (X11) License
=cut