The Perl and Raku Conference 2025: Greenville, South Carolina - June 27-29 Learn more

#!/usr/bin/env perl
# ABSTRACT: Concatenate FASTA or FASTQ files
# PODNAME: fu-cat
use 5.012;
use warnings FATAL => 'all';
use Digest::MD5 qw(md5_base64);
use File::Basename;
use FindBin qw($RealBin);
use Getopt::Long;
use Pod::Usage;
# NOTE(review): FASTX::Reader and Proch::Seqfu are used below, but no `use`
# line for them is visible in this file -- confirm they are loaded (after the
# local-lib block, so that the in-tree copy is picked up when present).
# The following placeholder is to be programmatically replaced with 'use lib "$RealBin/../lib"' if needed
#~loclib~
# Prefer the in-tree library, but only when running from a source checkout
# (recognised by lib/Proch/N50.pm and the Changes file next to it).
# `use lib` is executed at compile time, so placing it inside a run-time `if`
# never made it conditional: the path was always added. The test is therefore
# performed inside BEGIN, and the path is added with the run-time equivalent
# of `use lib`. $RealBin is already set here because `use FindBin` runs at
# compile time too.
BEGIN {
    if ( -e "$RealBin/../lib/Proch/N50.pm" and -e "$RealBin/../Changes" ) {
        require lib;
        lib->import("$RealBin/../lib");
    }
}
# Program identity. The version falls back to "<Dev>" when Proch::Seqfu does
# not define $VERSION.
my $BASENAME = basename($0);
my $VERSION  = $Proch::Seqfu::VERSION // "<Dev>";

# Output format switches
my $opt_force_fasta;
my $opt_force_fastq;

# Sequence filters and transformations
my $opt_upper;
my $opt_minlen;
my $opt_maxlen;
my $opt_dereplicate;

# Renaming
my $opt_rename_md5;
my $opt_rename_string;
my $opt_separator = '.';    # placed between a duplicate name and its counter

# Output layout: starts from the line size defined by Proch::Seqfu
my $opt_line_size = $Proch::Seqfu::fu_linesize;

# Informational switches
my $opt_help;
my $opt_verbose;
my $opt_version;
# Command-line specification: each option is bound to the variable it fills.
my @option_spec = (
    'f|fasta'       => \$opt_force_fasta,
    'q|fastq'       => \$opt_force_fastq,
    'd|dereplicate' => \$opt_dereplicate,
    '5|rename-md5'  => \$opt_rename_md5,
    'r|rename=s'    => \$opt_rename_string,
    's|separator=s' => \$opt_separator,
    'w|width=i'     => \$opt_line_size,
    'u|uppercase'   => \$opt_upper,
    'l|minlen=i'    => \$opt_minlen,
    'm|maxlen=i'    => \$opt_maxlen,
    'version'       => \$opt_version,
    'verbose'       => \$opt_verbose,
    'help'          => \$opt_help,
);

# A failed parse ends the run with a short hint on STDERR and exit status 1.
GetOptions(@option_spec) or do {
    say STDERR "Wrong parameters: type $BASENAME --help for full documentation.";
    exit 1;
};
# Hand the line width and verbosity chosen on the command line to Proch::Seqfu
($Proch::Seqfu::fu_linesize, $Proch::Seqfu::fu_verbose) = ($opt_line_size, $opt_verbose);

# --fasta and --fastq are mutually exclusive
die " PARAMETER ERROR:\n You must specify either --fasta or --fastq (or none), not both.\n"
    if ($opt_force_fasta and $opt_force_fastq);

# --version: version() prints the versions and exits
version() if ($opt_version);

# --help: show the full manual and exit successfully
pod2usage({ -verbose => 2, -exitval => 0 }) if ($opt_help);

# Without input files, print a short banner and fall back to STDIN ('-')
unless (defined $ARGV[0]) {
    usage();
    push(@ARGV, '-');
}
# Autodetect output format if not specified.
# Rule: FASTA when STDIN is among the inputs or when at least one file is
# detected as FASTA; FASTQ otherwise.
if (not $opt_force_fasta and not $opt_force_fastq){
    # Prescan the inputs, stopping as soon as the format is decided
    for my $filename (@ARGV) {
        if ($filename eq '-' or $filename eq '{{STDIN}}') {
            $opt_force_fasta = 1;
            verbose("When requiring to parse STDIN, output format is defaulted to FASTA (unless --fastq is on)");
            last;
        }

        # Always call verbose() with parentheses: the bare form only parses
        # when the sub has been predeclared before this line.
        verbose("Autodetecting ");
        my $fx_reader = FASTX::Reader->new({ filename => "$filename" });
        my $format    = $fx_reader->getFileFormat("$filename");

        # NOTE(review): the defined-check assumes getFileFormat may return an
        # undefined value for an unrecognised file -- confirm against
        # FASTX::Reader. Without it, such a file would abort the program with
        # a bare "uninitialized value" message, since all warnings are fatal.
        if (defined $format and $format eq 'fasta') {
            $opt_force_fasta = 1;
            last;
        }
    }

    if (not $opt_force_fasta) {
        verbose("Output format: FASTQ");
        $opt_force_fastq = 1;
    } else {
        verbose("Output format: FASTA");
    }
}
# Bookkeeping shared by all input files:
#   %printed_seq_names - how many records with a given original name were kept
#   %printed_seqs      - how many times a sequence (keyed by MD5) was seen
my %printed_seq_names = ();
my %printed_seqs = ();

# Process all the files
for my $filename (@ARGV) {
    say STDERR " - $filename" if ($Proch::Seqfu::fu_verbose);
    # '-' on the command line is handed to the reader as '{{STDIN}}'
    $filename = '{{STDIN}}' if ($filename eq '-');
    my $reader = FASTX::Reader->new({ filename => "$filename"});

    while (my $s = $reader->getRead() ) {
        # Length filters come first: a record dropped here must not take part
        # in the duplicate-name numbering below.
        my $seq_length = length($s->{seq});
        next if (defined $opt_minlen and $seq_length < $opt_minlen);    # too short
        next if (defined $opt_maxlen and $seq_length > $opt_maxlen);    # too long

        # Dereplication: keep only the first occurrence of each sequence.
        # The checksum is computed on the sequence as read (case-sensitive).
        my $seq_sum;
        if ($opt_dereplicate) {
            $seq_sum = md5_base64($s->{seq});
            $printed_seqs{$seq_sum}++;
            next if ($printed_seqs{$seq_sum} > 1);
        }

        # Duplicate name finder. Only records that passed the filters are
        # counted, so the suffix is added when the name "was already printed"
        # (as documented) and not when an earlier homonym was skipped.
        my $name = $s->{name};
        if ($printed_seq_names{ $s->{name} }) {
            $name .= $opt_separator . $printed_seq_names{ $s->{name} };
        }
        $printed_seq_names{ $s->{name} }++;

        # --rename-md5 only has effect together with --dereplicate
        $name = $seq_sum if ($opt_dereplicate and $opt_rename_md5);

        # --uppercase applies to both output formats (it used to be honoured
        # for FASTA output only)
        $s->{seq} = uc($s->{seq}) if ($opt_upper);

        # Print seq
        if ($opt_force_fasta) {
            fu_printfasta($name, $s->{comment}, $s->{seq});
        } elsif ($opt_force_fastq) {
            # FASTQ output needs a quality string: stop rather than emit a broken record
            if (not defined $s->{qual}) {
                die " FATAL ERROR [$filename]:\n Trying to print sequence <$name> in FASTQ format, but no quality found\n";
            }
            fu_printfastq($name, $s->{comment}, $s->{seq}, $s->{qual});
        }
    }
}
# Print a short banner on STDERR: program name and version, a one-line
# description, a pointer to --help and a notice that input is expected on STDIN.
sub usage {
    my @banner = (
        " $BASENAME $VERSION",
        " A program to concatenate sequence files",
        " " . '-' x 50,
        " Type \`$BASENAME --help\` to display the full manual",
        " Waiting for sequences from STDIN. Press Ctrl-C to exit.",
    );
    # One line per element; say() supplies the final newline
    say STDERR join("\n", @banner);
}
# Print the program name and version on STDOUT, the versions of the
# supporting libraries on STDERR, then exit with status 0.
sub version {
    say $BASENAME, " ", $VERSION;
    # Either library version can be undefined (e.g. in a development
    # checkout). With all warnings fatal, printing an undefined value would
    # abort --version, so fall back to the same "<Dev>" placeholder used for
    # the program version.
    my $seqfu_version  = $Proch::Seqfu::VERSION  // "<Dev>";
    my $reader_version = $FASTX::Reader::VERSION // "<Dev>";
    say STDERR "Using Proch::Seqfu=", $seqfu_version, " and FASTX::Reader=", $reader_version;
    exit();
}
;
__END__
=pod
=encoding UTF-8
=head1 NAME
fu-cat - Concatenate FASTA or FASTQ files
=head1 VERSION
version 1.7.0
=head1 SYNOPSIS
fu-cat [options] [FILE1 FILE2 FILE3...]
=head1 DESCRIPTION
This program parses a list of FASTA/FASTQ files and concatenates them,
ensuring consistent output. Duplicate sequence names are renamed.
Will try to autodetect the format of all files before executing and
decide accordingly the output format (FASTA if at least one of the
files is FASTA, otherwise FASTQ). When reading from STDIN the output
format defaults to FASTA, unless C<--fastq> is specified. When printing
FASTQ, a sequence without a quality string stops the program with an
error.
If no files are provided the program will try reading from STDIN,
otherwise add a '-' to the list of files to also read from STDIN.
=head1 NAME
fu-cat - concatenate FASTA or FASTQ files
=head1 PARAMETERS
=over 4
=item I<-s>, I<--separator>
When a second sequence with a name that was already printed is found,
the program will append a progressive number, separated by this string.
Use `fu-rename` if you need more options.
[default: "."]
=item I<-f>, I<--fasta>
Force FASTA output
=item I<-q>, I<--fastq>
Force FASTQ output. Sequences without quality can B<not> be printed
in this format: the program stops with an error when it finds one
=item I<-d>, I<--dereplicate>
Print each sequence only once
=item I<-5>, I<--rename-md5>
(use with -d) rename each sequence name with the MD5sum of the sequence.
=item I<-l>, I<--minlen> INT
Do not print sequences shorter (exclusive) than INT
=item I<-m>, I<--maxlen> INT
Do not print sequences longer (exclusive) than INT
=item I<-u>, I<--uppercase>
Will print the whole sequence in uppercase
=item I<-w>, I<--width> INT
Size of the FASTA lines. Specifying 0 will print the whole sequence on a single line (default: 0)
=item I<--verbose>
Print more details
=item I<--help>
Display this help page
=item I<--version>
Print version and exit
=back
=head1 MODERN ALTERNATIVE
This suite of tools has been superseded by B<SeqFu>, a compiled
program providing faster and safer tools for sequence analysis.
This suite is maintained for the higher portability of Perl scripts
under certain circumstances.
SeqFu is available at L<https://github.com/telatin/seqfu2>, and
can be installed with BioConda C<conda install -c bioconda seqfu>
=head1 CITING
Telatin A, Fariselli P, Birolo G.
I<SeqFu: A Suite of Utilities for the Robust and Reproducible Manipulation of Sequence Files>.
Bioengineering 2021, 8, 59. L<https://doi.org/10.3390/bioengineering8050059>
=head1 AUTHOR
Andrea Telatin <andrea@telatin.com>
=head1 COPYRIGHT AND LICENSE
This software is Copyright (c) 2018-2027 by Quadram Institute Bioscience.
This is free software, licensed under:
The MIT (X11) License
=cut